Beispiel #1
0
def main(args):
    """Demultiplex barcoded Illumina reads (single- or paired-end) into one FASTQ.

    Steps, in order:
      1. Parse ``args`` and strip non-word characters from the output base name.
      2. Start the log file and run the dependency version checks.
      3. Load barcodes and primers from ``--mapping_file`` or, without one,
         from ``--barcode_fasta`` / ``--reverse_barcode`` plus the primer options.
      4. Uncompress ``.gz`` inputs, count the reads and split the input into
         chunks inside a per-process temporary folder.
      5. Run ``processRead`` (SE) or ``processReadsPE`` (PE) on every chunk.
      6. Concatenate the ``*.demux.fq`` chunk outputs, sum the ``.stats`` files,
         re-index the reads, count reads per sample, write a mapping file and
         gzip the final output.

    Args:
        args: list of command-line tokens handed to ``argparse``.

    Side effects:
        Rebinds the module globals listed in the ``global`` statement
        (presumably read by the worker functions -- confirm against them),
        creates and removes files named after ``--out``, and calls
        ``sys.exit(1)`` when neither a usable mapping file nor a barcode FASTA
        is supplied.
    """
    global FwdPrimer, RevPrimer, SampleData, Barcodes, RevBarcodes, tmpdir, usearch
    parser = argparse.ArgumentParser(prog='amptk-process_ion.py', usage="%(prog)s [options] -i file.fastq\n%(prog)s -h for help menu",
        description='''Script finds barcodes, strips forward and reverse primers, relabels, and then trim/pads reads to a set length''',
        epilog="""Written by Jon Palmer (2015) [email protected]""",
        formatter_class=MyFormatter)

    parser.add_argument('-i','--fastq', dest='fastq', required=True, help='FASTQ R1 file')
    parser.add_argument('--reverse', help='Illumina R2 reverse reads')
    parser.add_argument('-o','--out', dest="out", default='illumina2', help='Base name for output')
    parser.add_argument('-f','--fwd_primer', dest="F_primer", default='fITS7', help='Forward Primer')
    parser.add_argument('-r','--rev_primer', dest="R_primer", default='ITS4', help='Reverse Primer')
    parser.add_argument('-m','--mapping_file', help='Mapping file: QIIME format can have extra meta data columns')
    parser.add_argument('-p','--pad', default='off', choices=['on', 'off'], help='Pad with Ns to a set length')
    parser.add_argument('--primer_mismatch', default=2, type=int, help='Number of mis-matches in primer')
    parser.add_argument('--barcode_mismatch', default=0, type=int, help='Number of mis-matches in barcode')
    parser.add_argument('--barcode_fasta', help='FASTA file containing Barcodes (Names & Sequences)')
    parser.add_argument('--barcode_not_anchored', action='store_true', help='Barcodes (indexes) are not at start of reads')
    parser.add_argument('--reverse_barcode', help='FASTA file containing 3 prime Barocdes')
    parser.add_argument('--min_len', default=100, type=int, help='Minimum read length to keep')
    parser.add_argument('-l','--trim_len', default=300, type=int, help='Trim length for reads')
    parser.add_argument('--full_length', action='store_true', help='Keep only full length reads (no trimming/padding)')
    parser.add_argument('--merge_method', default='usearch', choices=['usearch', 'vsearch'], help='Software to use for PE read merging')
    parser.add_argument('--cpus', type=int, help="Number of CPUs. Default: auto")
    parser.add_argument('-u','--usearch', dest="usearch", default='usearch9', help='USEARCH EXE')
    args = parser.parse_args(args)

    def _strip_gz(path):
        """Return ``path`` without a trailing '.gz'; other paths are returned unchanged."""
        return path[:-len('.gz')] if path.endswith('.gz') else path

    def _resolve_primer(name, label):
        """Look ``name`` up in the AMPtk primer db, else treat it as a literal sequence."""
        if name in amptklib.primer_db:
            sequence = amptklib.primer_db.get(name)
            amptklib.log.info("{:} {:} primer found in AMPtk primer db, setting to: {:}".format(name, label, sequence))
            return sequence
        amptklib.log.info("{:} {:} primer not found in AMPtk primer db, assuming it is actual primer sequence.".format(name, label))
        return name

    # the base name is used as a file prefix, so drop anything that is not a word character
    args.out = re.sub(r'\W+', '', args.out)

    # start from a fresh log file on every run
    log_name = args.out + '.amptk-demux.log'
    if os.path.isfile(log_name):
        os.remove(log_name)
    amptklib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv)+'\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")

    #initialize script, log system info and usearch version
    amptklib.SystemInfo()
    #Do a version check
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)

    #get number of CPUs to use
    cpus = args.cpus if args.cpus else multiprocessing.cpu_count()

    #parse a mapping file or a barcode fasta file, primers, etc get setup
    #dealing with Barcodes, get ion barcodes or parse the barcode_fasta argument
    barcode_file = args.out + ".barcodes_used.fa"
    rev_barcode_file = args.out + '.revbarcodes_used.fa'
    amptklib.SafeRemove(barcode_file)
    amptklib.SafeRemove(rev_barcode_file)

    #check if mapping file passed, use this if present, otherwise use command line arguments
    SampleData = {}
    Barcodes = {}
    RevBarcodes = {}
    FwdPrimer = ''
    RevPrimer = ''
    if args.mapping_file:
        if not os.path.isfile(args.mapping_file):
            amptklib.log.error("Mapping file not found: %s" % args.mapping_file)
            sys.exit(1)
        SampleData, Barcodes, RevBarcodes, FwdPrimer, RevPrimer = amptklib.parseMappingFileNEW(args.mapping_file)
    else: #no mapping file, so create dictionaries from barcode fasta files
        if not args.barcode_fasta:
            amptklib.log.error("You did not specify a --barcode_fasta or --mapping_file, one is required")
            sys.exit(1)
        shutil.copyfile(args.barcode_fasta, barcode_file)
        Barcodes = amptklib.fasta2barcodes(barcode_file, False)
        if args.reverse_barcode:
            shutil.copyfile(args.reverse_barcode, rev_barcode_file)
            RevBarcodes = amptklib.fasta2barcodes(rev_barcode_file, False)

        #parse primers here so doesn't conflict with mapping primers
        #look up primer db otherwise default to entry
        FwdPrimer = _resolve_primer(args.F_primer, 'fwd')
        RevPrimer = _resolve_primer(args.R_primer, 'rev')

    #check if input is compressed
    gzip_list = [os.path.abspath(path) for path in (args.fastq, args.reverse)
                 if path and path.endswith('.gz')]
    if gzip_list:
        amptklib.log.info("Gzipped input files detected, uncompressing")
        for gz_path in gzip_list:
            amptklib.Funzip(gz_path, _strip_gz(gz_path), cpus)
        # only a trailing '.gz' is removed, so an input that was not gzipped
        # (or a directory name containing '.gz') is left untouched
        args.fastq = _strip_gz(args.fastq)
        if args.reverse:
            args.reverse = _strip_gz(args.reverse)

    #Count FASTQ records
    amptklib.log.info("Loading FASTQ Records")
    orig_total = amptklib.countfastq(args.fastq)
    size = amptklib.checkfastqsize(args.fastq)
    readablesize = amptklib.convertSize(size*2)
    amptklib.log.info('{:,} reads ({:})'.format(orig_total, readablesize))

    #output barcodes/samples
    amptklib.log.info('Searching for {:} forward barcodes and {:} reverse barcodes'.format(len(Barcodes), len(RevBarcodes)))

    #create tmpdir and split input into n cpus
    tmpdir = args.out.split('.')[0]+'_'+str(os.getpid())
    if not os.path.exists(tmpdir):
        os.makedirs(tmpdir)

    #tell user about number of cores using
    amptklib.log.info("Splitting FASTQ files over {:} cpus".format(cpus))

    if args.reverse:
        amptklib.log.info("Demuxing PE Illumina reads; FwdPrimer: {:} RevPrimer: {:}".format(FwdPrimer, RevPrimer))
    else:
        amptklib.log.info("Demuxing SE Illumina reads; FwdPrimer: {:} RevPrimer: {:}".format(FwdPrimer, amptklib.RevComp(RevPrimer)))

    amptklib.log.info('Dropping reads less than {:} bp and setting lossless trimming to {:} bp.'.format(args.min_len, args.trim_len))

    if cpus > 1:
        if args.reverse:
            amptklib.split_fastqPE(args.fastq, args.reverse, orig_total, tmpdir, cpus*4)
            # each PE chunk is addressed by its shared prefix (the part before '_R')
            file_list = []
            for entry in os.listdir(tmpdir):
                if entry.endswith('.fq'):
                    filepart = os.path.join(tmpdir, entry.split('_R')[0])
                    if filepart not in file_list:
                        file_list.append(filepart)
            amptklib.runMultiProgress(processReadsPE, file_list, cpus, args=args)
        else:
            #split fastq file
            amptklib.split_fastq(args.fastq, orig_total, tmpdir, cpus*4)
            #now get file list from tmp folder
            file_list = [os.path.join(tmpdir, entry) for entry in os.listdir(tmpdir)
                         if entry.endswith(".fq")]
            #finally process reads over number of cpus
            amptklib.runMultiProgress(processRead, file_list, cpus, args=args)
    else:
        # single CPU: treat the whole input as one chunk
        if args.reverse:
            shutil.copyfile(args.fastq, os.path.join(tmpdir, 'chunk_R1.fq'))
            shutil.copyfile(args.reverse, os.path.join(tmpdir, 'chunk_R2.fq'))
            processReadsPE(os.path.join(tmpdir, 'chunk'), args=args)
        else:
            shutil.copyfile(args.fastq, os.path.join(tmpdir, 'chunk.fq'))
            processRead(os.path.join(tmpdir, 'chunk.fq'), args=args)

    print("-------------------------------------------------------")
    #Now concatenate all of the demuxed files together
    amptklib.log.info("Concatenating Demuxed Files")

    tmpDemux = args.out + '.tmp.demux.fq'
    with open(tmpDemux, 'w') as outfile:
        for filename in glob.glob(os.path.join(tmpdir,'*.demux.fq')):
            if filename == tmpDemux:
                continue
            with open(filename, 'r') as readfile:
                shutil.copyfileobj(readfile, outfile)

    #parse the stats: the first line of every .stats file is a comma separated
    #list of integer counters, summed position by position (6 counters are
    #tracked for PE runs, 7 for SE runs)
    finalstats = [0] * (6 if args.reverse else 7)
    for entry in os.listdir(tmpdir):
        if entry.endswith('.stats'):
            with open(os.path.join(tmpdir, entry), 'r') as statsfile:
                fields = statsfile.readline().rstrip().split(',')
            newstats = [int(i) for i in fields]
            for x, num in enumerate(newstats):
                finalstats[x] += num

    amptklib.log.info('{0:,}'.format(finalstats[0])+' total reads')
    if args.reverse:
        amptklib.log.info('{0:,}'.format(finalstats[0]-finalstats[1]-finalstats[3])+' valid Barcodes')
        amptklib.log.info('{0:,}'.format(finalstats[5])+' valid output reads (Barcodes and Primers)')
    else:
        if args.reverse_barcode:
            amptklib.log.info('{0:,}'.format(finalstats[0]-finalstats[1]-finalstats[2]-finalstats[4])+' valid Fwd and Rev Barcodes')
        else:
            amptklib.log.info('{0:,}'.format(finalstats[0]-finalstats[1])+' valid Barcode')
            amptklib.log.info('{0:,}'.format(finalstats[0]-finalstats[1]-finalstats[2])+' Fwd Primer found, {0:,}'.format(finalstats[3])+ ' Rev Primer found')
        amptklib.log.info('{0:,}'.format(finalstats[5])+' discarded too short (< %i bp)' % args.min_len)
        amptklib.log.info('{0:,}'.format(finalstats[6])+' valid output reads')

    #clean up tmp folder
    amptklib.SafeRemove(tmpdir)

    #last thing is to re-number of reads as it is possible they could have same name from multitprocessor split
    catDemux = args.out+'.demux.fq'
    amptklib.fastqreindex(tmpDemux, catDemux)
    amptklib.SafeRemove(tmpDemux)

    #now loop through data and find barcoded samples, counting each.....
    #every 4th line is a FASTQ header; the sample ID is the text between the
    #first '=' and the following ';'
    BarcodeCount = {}
    with open(catDemux, 'r') as infile:
        for header in itertools.islice(infile, 0, None, 4):
            ID = header.split("=",1)[-1].split(";")[0]
            BarcodeCount[ID] = BarcodeCount.get(ID, 0) + 1

    #now let's count the barcodes found and count the number of times they are found.
    rows = ["%22s:  %s" % ('Sample', 'Count')]
    for k, v in natsorted(list(BarcodeCount.items()), key=lambda k_v: k_v[1], reverse=True):
        rows.append("%22s:  %s" % (k, str(v)))
    barcode_counts = "\n".join(rows)
    amptklib.log.info("Found %i barcoded samples\n%s" % (len(BarcodeCount), barcode_counts))

    genericmapfile = args.out + '.mapping_file.txt'
    if not args.mapping_file:
        #create a generic mappingfile for downstream processes
        amptklib.CreateGenericMappingFile(Barcodes, RevBarcodes, FwdPrimer, RevPrimer, genericmapfile, BarcodeCount)
    else:
        amptklib.updateMappingFile(args.mapping_file, BarcodeCount, genericmapfile)

    #compress the output to save space
    FinalDemux = catDemux+'.gz'
    amptklib.Fzip(catDemux, FinalDemux, cpus)
    amptklib.removefile(catDemux)
    # remove the temporary uncompressed copies of gzipped inputs
    for gz_path in gzip_list:
        amptklib.removefile(_strip_gz(gz_path))

    #get file size
    filesize = os.path.getsize(FinalDemux)
    readablesize = amptklib.convertSize(filesize)
    amptklib.log.info("Output file:  %s (%s)" % (FinalDemux, readablesize))
    amptklib.log.info("Mapping file: %s" % genericmapfile)

    print("-------------------------------------------------------")
    if 'darwin' in sys.platform:
        print(col.WARN + "\nExample of next cmd: " + col.END + "amptk cluster -i %s -o out\n" % (FinalDemux))
    else:
        print("\nExample of next cmd: amptk cluster -i %s -o out\n" % (FinalDemux))
Beispiel #2
0
def main(args):
    """Process a folder of already de-multiplexed Illumina FASTQ files for amptk.

    Steps, in order:
      1. Parse ``args``, strip non-word characters from the output name and
         create the output folder.
      2. Start the log file and run the dependency version checks.
      3. Uncompress any ``*.fastq.gz`` files found in the input folder.
      4. Pick the FASTQ files to process: those whose name starts with a
         sample ID from ``--mapping_file``, or every ``*.fastq`` file when no
         mapping file is given (primers then come from the primer options).
      5. With ``--sra`` copy the files into the output folder and treat them as
         forward reads; otherwise pair ``_R1``/``_R2`` files and record the
         per-sample index information in ``<out>.filenames.txt``.
      6. Run ``safe_run`` (paired) or ``safe_run2`` (forward only) over the reads.
      7. Concatenate the ``*.demux.fq`` outputs, sum the ``.stats`` files, count
         reads per sample, write a mapping file and gzip the final output.

    Args:
        args: list of command-line tokens handed to ``argparse``.

    Side effects:
        Rebinds the module globals ``FwdPrimer``, ``RevPrimer`` and ``usearch``,
        creates and removes files in the input folder and under ``--out``, and
        calls ``sys.exit(1)`` on an unusable mapping file or unpaired input.
    """
    global FwdPrimer, RevPrimer, usearch
    parser = argparse.ArgumentParser(
        prog='amptk-process_illumina_folder.py',
        usage="%(prog)s [options] -i folder",
        description=
        '''Script that takes De-mulitplexed Illumina data from a folder and processes it for amptk (merge PE reads, strip primers, trim/pad to set length.''',
        epilog="""Written by Jon Palmer (2015) [email protected]""",
        formatter_class=MyFormatter)

    parser.add_argument('-i',
                        '--input',
                        dest='input',
                        required=True,
                        help='Folder of Illumina Data')
    parser.add_argument('-o',
                        '--out',
                        dest="out",
                        default='amptk-illumina',
                        help='Name for output folder')
    parser.add_argument(
        '-m',
        '--mapping_file',
        help='Mapping file: QIIME format can have extra meta data columns')
    parser.add_argument('--reads',
                        dest="reads",
                        default='paired',
                        choices=['paired', 'forward'],
                        help='PE or forward reads')
    parser.add_argument('--read_length',
                        type=int,
                        help='Read length, i.e. 2 x 300 bp = 300')
    parser.add_argument('-f',
                        '--fwd_primer',
                        dest="F_primer",
                        default='fITS7',
                        help='Forward Primer (fITS7)')
    parser.add_argument('-r',
                        '--rev_primer',
                        dest="R_primer",
                        default='ITS4',
                        help='Reverse Primer (ITS4)')
    parser.add_argument('--require_primer',
                        dest="primer",
                        default='on',
                        choices=['on', 'off'],
                        help='Require Fwd primer to be present')
    parser.add_argument('--primer_mismatch',
                        default=2,
                        type=int,
                        help='Number of mis-matches in primer')
    parser.add_argument('--barcode_mismatch',
                        default=1,
                        type=int,
                        help='Number of mis-matches allowed in index')
    parser.add_argument('--rescue_forward',
                        default='on',
                        choices=['on', 'off'],
                        help='Rescue Not-merged forward reads')
    parser.add_argument('--min_len',
                        default=100,
                        type=int,
                        help='Minimum read length to keep')
    parser.add_argument('--merge_method',
                        default='usearch',
                        choices=['usearch', 'vsearch'],
                        help='Software to use for PE read merging')
    parser.add_argument('-l',
                        '--trim_len',
                        default=300,
                        type=int,
                        help='Trim length for reads')
    parser.add_argument('--cpus',
                        type=int,
                        help="Number of CPUs. Default: auto")
    parser.add_argument(
        '--full_length',
        action='store_true',
        help='Keep only full length reads (no trimming/padding)')
    parser.add_argument('-p',
                        '--pad',
                        default='off',
                        choices=['on', 'off'],
                        help='Pad with Ns to a set length')
    parser.add_argument('-u',
                        '--usearch',
                        dest="usearch",
                        default='usearch9',
                        help='USEARCH executable')
    parser.add_argument(
        '--sra',
        action='store_true',
        help='Input files are from NCBI SRA not direct from illumina')
    parser.add_argument('--cleanup',
                        action='store_true',
                        help='Delete all intermediate files')
    args = parser.parse_args(args)

    def _resolve_primer(name, label):
        """Look ``name`` up in the AMPtk primer db, else treat it as a literal sequence."""
        if name in amptklib.primer_db:
            sequence = amptklib.primer_db.get(name)
            amptklib.log.info(
                "{:} {:} primer found in AMPtk primer db, setting to: {:}".
                format(name, label, sequence))
            return sequence
        amptklib.log.info(
            "{:} {:} primer not found in AMPtk primer db, assuming it is actual primer sequence."
            .format(name, label))
        return name

    #sometimes people add slashes in the output directory, this could be bad, try to fix it
    args.out = re.sub(r'\W+', '', args.out)

    #create directory and check for existing logfile
    if not os.path.exists(args.out):
        os.makedirs(args.out)

    log_name = args.out + '.amptk-demux.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    amptklib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")

    #initialize script, log system info and usearch version
    amptklib.SystemInfo()
    #get version of amptk
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)

    #Now all the data is in folder args.out that needs to be de-multiplexed
    cpus = args.cpus if args.cpus else multiprocessing.cpu_count()

    #check folder if files are gzipped, then gunzip them
    gzip_list = [
        entry for entry in os.listdir(args.input)
        if entry.endswith(".fastq.gz")
    ]
    if gzip_list:
        amptklib.log.info("Gzipped files detected, uncompressing")
        for gz_name in gzip_list:
            amptklib.log.debug("Uncompressing %s" % gz_name)
            # splitext drops only the trailing '.gz'
            OutName = os.path.join(args.input, os.path.splitext(gz_name)[0])
            amptklib.Funzip(os.path.join(args.input, gz_name), OutName, cpus)

    #check for mapping file, if exists, then use names from first column only for filenames
    SampleData = {}
    Barcodes = {}
    RevBarcodes = {}
    FwdPrimer = ''
    RevPrimer = ''
    if args.mapping_file:
        if not os.path.isfile(args.mapping_file):
            # report through the logger like every other fatal error here;
            # the original called amptklib.error, which no other call site uses
            amptklib.log.error("Mapping file is not valid: %s" %
                               args.mapping_file)
            sys.exit(1)
        SampleData, Barcodes, RevBarcodes, FwdPrimer, RevPrimer = amptklib.parseMappingFileNEW(
            args.mapping_file)
        # NOTE(review): mapdata is never read below; the call is kept in case
        # parseMappingFileIllumina validates or logs -- confirm before removing
        #forward primer in first item in tuple, reverse in second
        mapdata = amptklib.parseMappingFileIllumina(args.mapping_file)
        sample_prefixes = tuple(SampleData.keys())
        #loop through the files in the folder and keep the ones that start with a sample name
        filenames = [
            entry for entry in os.listdir(args.input)
            if entry.startswith(sample_prefixes) and entry.endswith('.fastq')
        ]

        if len(filenames) < 1:
            amptklib.log.error(
                "Found 0 valid files from mapping file. Mapping file SampleID must match start of filenames"
            )
            sys.exit(1)

    else:  #if not then search through and find all the files you can in the folder
        # Illumina file names look like the following:
        # <sample name>_<i5>-<i7>_L<lane (0-padded to 3 digits)>_R<read number>_<set number (0-padded to 3 digits>.fastq.gz

        #now get the FASTQ files and proceed
        filenames = [
            entry for entry in os.listdir(args.input)
            if entry.endswith(".fastq")
        ]
        #look up primer db otherwise default to entry
        FwdPrimer = _resolve_primer(args.F_primer, 'fwd')
        RevPrimer = _resolve_primer(args.R_primer, 'rev')

    #if files are from SRA, then do something different as they are already merged
    if args.sra:
        #take list of filenames, move over to output folder
        sampleDict = {}
        fastq_for = []
        for x in filenames:
            rename = os.path.basename(x).split(".f", -1)[0]
            sampleDict[rename] = 'unknown'
            copied = os.path.join(args.out, rename + '.fq')
            shutil.copyfile(os.path.join(args.input, x), copied)
            fastq_for.append(copied)
        args.reads = 'forward'
    else:
        if len(filenames) % 2 != 0:
            # sent through the logger (not print) to match the other fatal errors
            amptklib.log.error(
                "Check your input files, they do not seem to be properly paired"
            )
            sys.exit(1)
        #check list for files, i.e. they need to have _R1 and _R2 in the filenames, otherwise throw exception
        if not any('_R1' in x for x in filenames):
            amptklib.log.error(
                "Did not find valid FASTQ files.  Your files must have _R1 and _R2 in filename, rename your files and restart script."
            )
            sys.exit(1)
        seen_samples = set()
        fastq_for = []
        fastq_rev = []
        sampleDict = {}
        filename_map = args.out + '.filenames.txt'
        with open(filename_map, 'w') as map_file:
            map_file.write("Name\t[i5]\t[i7]\tLane\tSet_num\n")
            for item in sorted(filenames):
                if '_R1' in item:
                    fastq_for.append(os.path.join(args.input, item))
                if '_R2' in item:
                    fastq_rev.append(os.path.join(args.input, item))
                column = item.split("_")
                sample = column[0]
                if sample in seen_samples:
                    continue
                seen_samples.add(sample)
                # NOTE(review): a file name without any '_' raises IndexError
                # on column[1] here, outside the try below
                if "-" in column[1]:
                    #looking here for the linker between i5 and i7 seqs
                    barcode = column[1].split("-")
                    i5 = barcode[0]
                    i7 = barcode[1]
                else:
                    i5 = column[1]
                    i7 = "None"
                try:
                    map_file.write("%s\t%s\t%s\t%s\t%s\n" %
                                   (sample, i5, i7, column[2],
                                    column[4].split(".", 1)[0]))
                except IndexError:
                    # fewer than five '_'-separated fields: no row is written
                    amptklib.log.debug(
                        "Non-standard names detected, skipping mapping file"
                    )
                if i7 != "None":
                    sampleDict[sample] = i5 + '-' + i7
                else:
                    sampleDict[sample] = i5

    if args.full_length and args.primer == 'off':
        amptklib.log.info(
            '--full_length is not compatible with --require_primer off, turning --full_length off'
        )
        args.full_length = False

    #tell user about number of cores using
    amptklib.log.info('Demuxing data using {:} cpus'.format(cpus))
    amptklib.log.info(
        'Dropping reads less than {:} bp and setting lossless trimming to {:} bp.'
        .format(args.min_len, args.trim_len))

    #zip read lists into a single list of tuples
    if args.reads == 'paired':
        amptklib.log.info(
            "Strip Primers and Merge PE reads. FwdPrimer: {:} RevPrimer: {:}".
            format(FwdPrimer, RevPrimer))
        readList = list(zip(fastq_for, fastq_rev))
        amptklib.runMultiProgress(safe_run, readList, cpus, args=args)
    else:
        amptklib.log.info(
            "Strip Primers. FwdPrimer: {:} RevPrimer: {:}".format(
                FwdPrimer, RevPrimer))
        amptklib.runMultiProgress(safe_run2, fastq_for, cpus, args=args)

    #cleanup to save space: remove the uncompressed copies created above,
    #deriving the name exactly as the uncompress step did
    for gz_name in gzip_list:
        amptklib.removefile(
            os.path.join(args.input, os.path.splitext(gz_name)[0]))
    print("-------------------------------------------------------")
    #Now concatenate all of the demuxed files together
    amptklib.log.info("Concatenating Demuxed Files")

    catDemux = args.out + '.demux.fq'
    with open(catDemux, 'w') as outfile:
        for filename in glob.glob(os.path.join(args.out, '*.demux.fq')):
            if filename == catDemux:
                continue
            with open(filename, 'r') as readfile:
                shutil.copyfileobj(readfile, outfile)

    #parse the stats
    #(Total, ForPrimerFound, RevPrimerFound, multiHits, TooShort, ValidSeqs))
    finalstats = [0, 0, 0, 0, 0, 0]
    for entry in os.listdir(args.out):
        if entry.endswith('.stats'):
            with open(os.path.join(args.out, entry), 'r') as statsfile:
                fields = statsfile.readline().replace('\n', '').split(',')
            newstats = [int(i) for i in fields]
            for x, num in enumerate(newstats):
                finalstats[x] += num
    amptklib.log.info('{0:,}'.format(finalstats[0]) + ' total reads')
    amptklib.log.info('{0:,}'.format(finalstats[1]) +
                      ' Fwd Primer found, {0:,}'.format(finalstats[2]) +
                      ' Rev Primer found')
    amptklib.log.info('{0:,}'.format(finalstats[3]) +
                      ' discarded Primer incompatibility')
    amptklib.log.info('{0:,}'.format(finalstats[4]) +
                      ' discarded too short (< %i bp)' % args.min_len)
    amptklib.log.info('{0:,}'.format(finalstats[5]) + ' valid output reads')

    #now loop through data and find barcoded samples, counting each.....
    #every 4th line is a FASTQ header; the sample ID is the text after the
    #last '=' up to the following ';'
    BarcodeCount = {}
    with open(catDemux, 'r') as infile:
        for header in itertools.islice(infile, 0, None, 4):
            ID = header.split("=")[-1].split(";")[0]
            BarcodeCount[ID] = BarcodeCount.get(ID, 0) + 1

    #now let's count the barcodes found and count the number of times they are found.
    rows = ["%30s:  %s" % ('Sample', 'Count')]
    for k, v in natsorted(list(BarcodeCount.items()),
                          key=lambda k_v: k_v[1],
                          reverse=True):
        rows.append("%30s:  %s" % (k, str(v)))
    barcode_counts = "\n".join(rows)
    amptklib.log.info("Found %i barcoded samples\n%s" %
                      (len(BarcodeCount), barcode_counts))

    genericmapfile = args.out + '.mapping_file.txt'
    if not args.mapping_file:
        #create a generic mappingfile for downstream processes
        amptklib.CreateGenericMappingFileIllumina(sampleDict, FwdPrimer,
                                                  RevPrimer, genericmapfile,
                                                  BarcodeCount)
    else:
        amptklib.updateMappingFile(args.mapping_file, BarcodeCount,
                                   genericmapfile)

    #compress the output to save space
    FinalDemux = catDemux + '.gz'
    amptklib.Fzip(catDemux, FinalDemux, cpus)
    amptklib.removefile(catDemux)

    #get file size
    filesize = os.path.getsize(FinalDemux)
    readablesize = amptklib.convertSize(filesize)
    amptklib.log.info("Output file:  %s (%s)" % (FinalDemux, readablesize))
    amptklib.log.info("Mapping file: %s" % genericmapfile)
    if args.cleanup:
        # the final outputs live next to, not inside, the args.out folder
        shutil.rmtree(args.out)
    print("-------------------------------------------------------")
    if 'darwin' in sys.platform:
        print(col.WARN + "\nExample of next cmd: " + col.END +
              "amptk cluster -i %s -o out\n" % (FinalDemux))
    else:
        print("\nExample of next cmd: amptk cluster -i %s -o out\n" %
              (FinalDemux))
Beispiel #3
0
def main(args):
    """Filter an OTU table for index-bleed, optionally using a spike-in mock.

    Parses the command line in *args*, loads a tab-delimited OTU table and the
    matching OTU multi-FASTA, and then:

    1. optionally maps the OTUs onto a mock community (via USEARCH) and renames
       the OTUs that match mock members / look like mock chimeras or variants;
    2. sorts the table, drops OTUs below ``--min_reads_otu`` and (optionally)
       normalizes every sample to 100,000 reads;
    3. estimates the index-bleed rate from the mock sample (or takes it from
       ``-p``) and zeroes every cell that falls below that fraction of the
       per-OTU reference count chosen with ``--threshold``;
    4. applies the ``--subtract``, ``--drop``, ``--min_samples_otu`` filters;
    5. writes the sorted / normalized / final / binary / stats tables and a
       FASTA of the surviving OTUs, or - when OTUs are found in the
       ``--negatives`` samples - a cleaned FASTA plus instructions for
       rebuilding the OTU table.

    Args:
        args: list of command-line argument strings (without the program
            name), handed to ``argparse``.

    The function returns nothing; it exits via ``sys.exit(1)`` on invalid
    input (empty table, unknown mock barcode, missing ``--mc``).
    """
    parser = argparse.ArgumentParser(
        prog='amptk-filter.py',
        description=(
            'Script inspects output of amptk-OTU_cluster.py and \n'
            '\t\tdetermines useful threshold for OTU output based on a spike-in \n'
            '\t\tmock community.'),
        epilog="""Written by Jon Palmer (2015) [email protected]""",
        formatter_class=MyFormatter)

    parser.add_argument('-i',
                        '--otu_table',
                        required=True,
                        help='Input OTU table')
    parser.add_argument('-f',
                        '--fasta',
                        required=True,
                        help='Input OTUs (multi-fasta)')
    parser.add_argument('-b',
                        '--mock_barcode',
                        help='Barocde of Mock community')
    parser.add_argument('-p',
                        '--index_bleed',
                        help='Index Bleed filter. Default: auto')
    parser.add_argument('-t',
                        '--threshold',
                        default='max',
                        choices=['sum', 'max', 'top25', 'top10', 'top5'],
                        help='Threshold to use when calculating index-bleed')
    parser.add_argument(
        '-c',
        '--calculate',
        default='all',
        choices=['all', 'in'],
        help='Calculate index-bleed, if synthetic mock use all otherwise use in'
    )
    parser.add_argument('-s',
                        '--subtract',
                        default=0,
                        help='Threshold to subtract')
    parser.add_argument('-n',
                        '--normalize',
                        default='y',
                        choices=['y', 'n'],
                        help='Normalize OTU table prior to filtering')
    parser.add_argument('-m', '--mc', help='Multi-FASTA mock community')
    parser.add_argument(
        '-d',
        '--drop',
        nargs='+',
        help='samples to drop from table after index-bleed filtering')
    parser.add_argument('--ignore',
                        nargs='+',
                        help='Ignore OTUs during index-bleed')
    parser.add_argument('--delimiter',
                        default='tsv',
                        choices=['csv', 'tsv'],
                        help='Delimiter')
    parser.add_argument('--col_order',
                        nargs='+',
                        dest="col_order",
                        help='Provide space separated list')
    parser.add_argument('--keep_mock',
                        action='store_true',
                        help='Keep mock sample in OTU table (Default: False)')
    parser.add_argument('--show_stats',
                        action='store_true',
                        help='Show stats datatable STDOUT')
    parser.add_argument('--negatives',
                        nargs='+',
                        help='Negative Control Sample names')
    parser.add_argument('-o', '--out', help='Base output name')
    parser.add_argument('--min_reads_otu',
                        default=2,
                        type=int,
                        help='Minimum number of reads per OTU for experiment')
    parser.add_argument(
        '--min_samples_otu',
        default=1,
        type=int,
        help='Minimum number of samples per OTU for experiment')
    parser.add_argument('-u',
                        '--usearch',
                        dest="usearch",
                        default='usearch9',
                        help='USEARCH9 EXE')
    # NOTE: intermediate files are only removed when --debug is NOT given,
    # so the help text describes what the flag actually does.
    parser.add_argument('--debug',
                        action='store_true',
                        help='Keep Intermediate Files')
    args = parser.parse_args(args)

    parentdir = os.path.dirname(amptklib.__file__)

    # base name for all outputs: explicit -o, else derived from the table name
    base = args.out if args.out else args.otu_table.split(".otu_table")[0]

    # remove logfile if exists
    log_name = base + '.amptk-filter.log'
    amptklib.removefile(log_name)

    amptklib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")

    # initialize script, log system info and usearch version
    amptklib.SystemInfo()
    # Do a version check
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)

    # check if otu_table is empty
    amptklib.log.info("Loading OTU table: %s" % args.otu_table)
    if os.stat(args.otu_table).st_size == 0:
        amptklib.log.error("Input OTU table is empty")
        sys.exit(1)
    # the name of the first column differs depending on how the OTU table was
    # built, so read it from the header line and use it as the index column
    with open(args.otu_table, 'r') as f:
        OTUhead = f.readline().split('\t')[0]

    # --delimiter only controls the OUTPUT tables; input is always tab-delimited
    if args.delimiter == 'csv':
        delim, ending = ',', '.csv'
    else:
        delim, ending = '\t', '.txt'

    # setup outputs
    sorted_table = base + '.sorted' + ending
    normal_table_pct = base + '.normalized.pct' + ending
    normal_table_nums = base + '.normalized.num' + ending
    subtract_table = base + '.normalized.subtract' + ending
    filtered_table = base + '.normalized' + ending
    final_table = base + '.final' + ending
    final_binary_table = base + '.final.binary' + ending
    stats_table = base + '.stats' + ending

    # load OTU table into pandas DataFrame
    df = pd.read_csv(args.otu_table, sep='\t')
    df.set_index(OTUhead, inplace=True)
    headers = df.columns.values.tolist()
    # a trailing taxonomy column is split off and re-attached only on output
    if headers and headers[-1] in ('taxonomy', 'Taxonomy'):
        otuDict = df[headers[-1]].to_dict()
        del df[headers[-1]]
    else:
        otuDict = None

    def write_table(table, path):
        """Write *table* to *path*, appending a Taxonomy column when known.

        Works on a copy so the caller's DataFrame is never modified.
        """
        if otuDict:
            table = table.assign(Taxonomy=pd.Series(otuDict))
        table.to_csv(path, sep=delim)

    # parse OTU table to get total count for each OTU
    AddCounts = {otu: int(n) for otu, n in df.sum(axis=1).items()}

    # now add counts to fasta header (";size=N"), remembering any taxonomy
    # that was appended to the record id after the first ';'
    FastaCounts = base + '.otus.counts.fa'
    OTU_tax = {}
    with open(FastaCounts, 'w') as outfile, open(args.fasta, 'r') as infile:
        for rec in SeqIO.parse(infile, 'fasta'):
            if ';' in rec.id:
                ID, tax = rec.id.split(';', 1)
                OTU_tax[ID] = tax
            else:
                ID = rec.id
            outfile.write('>%s;size=%i\n%s\n' %
                          (ID, AddCounts.get(ID, 0), rec.seq))

    amptklib.log.info(
        'OTU table contains {:,} samples, {:,} OTUs, and {:,} reads counts'.
        format(len(df.columns.values.tolist()), len(df.index),
               int(df.values.sum())))

    # setup output files/variables
    mock_out = base + '.mockmap.txt'
    # defaults so the code below also works when no mock barcode is given
    annotate_dict = {}
    mock_otus = []
    auto_subtract = None
    bleedfilter = None

    if args.mock_barcode:  # if user passes a column name for mock
        # check if mock barcode is valid
        validBCs = df.columns.values.tolist()
        if args.mock_barcode not in validBCs:
            amptklib.log.error("%s not a valid barcode." % args.mock_barcode)
            amptklib.log.error("Valid barcodes: %s" % (' '.join(validBCs)))
            sys.exit(1)
        if args.col_order and args.mock_barcode not in args.col_order:
            amptklib.log.error("Error: %s not listed in --col_order." %
                               args.mock_barcode)
            sys.exit(1)
        # make sure there is a --mc passed here otherwise throw error
        if not args.mc:
            amptklib.log.error(
                "If using the -b,--barcode option you must specify a fasta file of mock community via the --mc option"
            )
            sys.exit(1)
        # --mc is either the name of a bundled mock community or a FASTA path
        bundled_mocks = {
            'mock3': 'amptk_mock3.fa',
            'mock2': 'amptk_mock2.fa',
            'mock1': 'amptk_mock1.fa',
            'synmock': 'amptk_synmock.fa',
        }
        if args.mc in bundled_mocks:
            mock_fasta = os.path.join(parentdir, 'DB', bundled_mocks[args.mc])
        else:
            mock_fasta = os.path.abspath(args.mc)

        # open mock community fasta and count records
        mock_ref_count = amptklib.countfasta(mock_fasta)

        # load OTU lengths into dictionary
        SeqLength = amptklib.fastalen2dict(args.fasta)

        # map the mock community sequences against the size-annotated OTUs,
        # reporting every hit (maxaccepts/maxrejects 0) at >= 65% identity
        amptklib.log.info("Mapping OTUs to Mock Community (USEARCH)")
        cmd = [
            usearch, '-usearch_global', mock_fasta, '-strand', 'plus', '-id',
            '0.65', '-db', FastaCounts, '-userout', mock_out, '-userfields',
            'query+target+id+ql+tl+alnlen+caln+mism+diffs', '-maxaccepts', '0',
            '-maxrejects', '0'
        ]
        amptklib.runSubprocess(cmd, amptklib.log)

        # generate dictionary for name change
        # If args.calculate is set to all, that means the script is trying to
        # measure a synthetic mock of some kind.  If that is the case, then
        # chimeras are < 95% identical to mock members and variants would be
        # hits in between, i.e. >= 95% but not the best hit.
        Results = {}  # mock ID -> list of hit tuples
        errorrate = {}  # OTU ID -> [mock ID, fewest diffs seen]
        with open(mock_out, 'r') as mapfile:
            for line in mapfile:
                cols = line.rstrip('\n').split('\t')
                MockID = cols[0]
                otuID, size = cols[1].split(';size=')[:2]
                abundance = int(size)
                pident = float(cols[2])
                length = int(cols[4])
                mism = int(cols[7])
                diffs = int(cols[8])
                score = abundance * pident * length
                previous = errorrate.get(otuID)
                if previous is None or diffs < previous[1]:
                    errorrate[otuID] = [MockID, diffs]
                Results.setdefault(MockID, []).append(
                    (otuID, abundance, pident, length, mism, diffs, score))

        found_dict = {}
        chimeras = []
        variants = []
        missing = []
        for k, v in natsorted(list(Results.items())):
            besthit = []
            # v is a list of hit tuples; bin each hit by percent identity
            for y in v:
                if y[2] >= 97.0:
                    besthit.append(y)
                elif y[2] >= 95.0:
                    if y[0] not in variants:
                        variants.append(y[0])
                elif y[0] not in chimeras:
                    chimeras.append(y[0])
            if besthit:
                # of the three most abundant >=97% hits keep the top scorer
                besthit.sort(key=lambda x: x[1], reverse=True)
                best = sorted(besthit[:3], key=lambda x: x[6], reverse=True)
                found_dict[k] = best[0]
            else:
                missing.append(k)

        # make name change dict
        seen = []
        for k, v in natsorted(list(found_dict.items())):
            ID = v[0].replace('_chimera', '')
            annotate_dict[ID] = k + '_pident=' + str(v[2]) + '_' + v[0]
            if v[0] not in seen:
                seen.append(v[0])
        if args.calculate == 'all':
            chimeras = [x for x in chimeras if x not in seen]
            variants = [x for x in variants if x not in seen]
            for i in chimeras:
                annotate_dict[i] = i + '_suspect_mock_chimera'
            for x in variants:
                annotate_dict[x] = x + '_suspect_mock_variant'
        if missing:
            amptklib.log.info("%i mock missing: %s" %
                              (len(missing), ', '.join(missing)))

        # rename OTUs
        df.rename(index=annotate_dict, inplace=True)

    # sort the table
    df2 = df.reindex(index=natsorted(df.index))
    if not args.col_order:
        amptklib.log.info("Sorting OTU table naturally")
        df = df2.reindex(columns=natsorted(df2.columns))
    else:
        amptklib.log.info(
            "Sorting OTU table by user defined order (--col_order)")
        # keep only the requested names that exist in the table; build a new
        # list instead of removing from the one being iterated, which would
        # skip the element following every removed name
        known_columns = set(df2.columns.values)
        col_headers = [c for c in args.col_order if c in known_columns]
        df = df2.reindex(columns=col_headers)
    SortedTable = df
    write_table(df, sorted_table)

    # get sums of columns
    fs = df.sum(axis=0)
    otus_per_sample_original = df[df > 0].count(axis=0, numeric_only=True)
    filtered = pd.DataFrame(df, columns=fs.index)
    # drop OTUs that are zero in every sample
    filt2 = filtered.loc[(filtered != 0).any(axis=1)]
    tos = filt2.sum(axis=1)
    # valid OTU must be found at least --min_reads_otu times, i.e. no singletons
    fotus = tos[tos >= args.min_reads_otu]
    if len(fotus.index) < len(tos.index):
        diff = len(tos.index) - len(fotus.index)
        amptklib.log.info(
            "Removing {:,} OTUs according to --min_reads_otu {:,}".format(
                diff, args.min_reads_otu))
    filt3 = pd.DataFrame(filt2, index=fotus.index)

    if args.normalize == 'y':
        # normalize the OTU table to the fraction of reads per sample
        normal = filt3.truediv(fs)
        write_table(normal, normal_table_pct)
        # normalize back to read counts, pretend 100,000 reads in each
        norm_round = np.round(normal.multiply(100000), decimals=0)
        write_table(norm_round, normal_table_nums)
        amptklib.log.info(
            "Normalizing OTU table to number of reads per sample")
    else:
        norm_round = filt3

    if args.mock_barcode:
        # now calculate the index-bleed in both directions (into the mock and
        # mock into the other samples)
        # OTUs that mapped to a mock member (not the suspect chimeras/variants)
        mock_otus = [
            v for v in annotate_dict.values() if '_suspect_mock_' not in v
        ]
        mock_lookup = set(mock_otus)
        sample = [i for i in norm_round.index if i not in mock_lookup]
        if args.ignore:
            mock_otus = [x for x in mock_otus if x not in args.ignore]
            sample = [x for x in sample if x not in args.ignore]
        # first calculate bleed out of mock community
        # slice normalized dataframe to get only mock OTUs from table
        mock_df = pd.DataFrame(norm_round, index=mock_otus)
        # if there are samples to drop, make sure they aren't being used in
        # this calculation (unknown names are ignored, as they are later on)
        if args.drop:
            mock_df = mock_df.drop(columns=args.drop, errors='ignore')
        # get total number of reads from mock OTUs from entire table
        total = mock_df.sum(axis=0).sum()
        # now drop the mock barcode sample
        mock_df = mock_df.drop(columns=[args.mock_barcode])
        # get number of reads that are result of bleed over
        bleed1 = mock_df.sum(axis=0).sum()
        # rate of bleed = bleed reads / total mock reads (0 if no mock reads)
        bleed1max = bleed1 / float(total) if total else 0.0

        # second, calculate bleed into mock community
        # get list of mock OTUs not found in any other sample -> these are
        # likely chimeras
        mock_only = pd.DataFrame(norm_round,
                                 index=list(norm_round.index),
                                 columns=[args.mock_barcode])
        mock_OTUs_zeros = mock_only.loc[(mock_only == 0).all(axis=1)]
        theRest = [
            x for x in list(norm_round.columns.values)
            if x != args.mock_barcode
        ]
        non_mocks = pd.DataFrame(norm_round, index=sample, columns=theRest)
        non_mock_zeros = non_mocks.loc[(non_mocks == 0).all(axis=1)]
        absent_from_mock = set(mock_OTUs_zeros.index)
        zeros = [
            x for x in list(non_mock_zeros.index) if x not in absent_from_mock
        ]
        if zeros:
            amptklib.log.info(
                "Found {:,} mock chimeras (only in mock sample and not mapped to mock sequences) excluding from index-bleed calculation"
                .format(len(zeros)))
            amptklib.log.debug('{:}'.format(', '.join(zeros)))
        # now get updated list of samples, dropping chimeras
        zero_lookup = set(zeros)
        samples_trimmed = [x for x in sample if x not in zero_lookup]
        # slice the OTU table to get all OTUs that are not in mock community
        # from the mock sample
        sample_df = pd.DataFrame(norm_round,
                                 index=samples_trimmed,
                                 columns=[args.mock_barcode])
        # get total number of reads that don't belong in mock
        bleed2 = sample_df.sum(axis=0).sum()
        # now pull the entire mock sample
        mock_sample = pd.DataFrame(norm_round, columns=[args.mock_barcode])
        mock_sample_total = mock_sample.sum(axis=1).sum()
        # bleed into mock = reads that don't belong / all reads in the mock
        # sample, so this is x% of bad reads in the mock
        bleed2max = (bleed2 / float(mock_sample_total)
                     if mock_sample_total else 0.0)
        # autocalculate the subtraction filter by taking the maximum value
        # that doesn't belong (0 when there is nothing to measure)
        auto_subtract = max(sample_df.max())
        if pd.isna(auto_subtract):
            auto_subtract = 0

        # get max values for bleed
        # can only use into samples measurement if not using synmock
        if args.calculate == 'all':
            bleedfilter = math.ceil(max(bleed1max, bleed2max) * 1000) / 1000
            amptklib.log.info(
                "Index bleed, mock into samples: %f%%.  Index bleed, samples into mock: %f%%."
                % (bleed1max * 100, bleed2max * 100))
        else:
            bleedfilter = math.ceil(bleed2max * 1000) / 1000
            amptklib.log.info("Index bleed, samples into mock: %f%%." %
                              (bleed2max * 100))

    # an explicit -p always wins over the value measured from the mock
    if args.index_bleed:
        args.index_bleed = float(args.index_bleed)
        amptklib.log.info(
            "Overwriting auto detect index-bleed, setting to %f%%" %
            (args.index_bleed * 100))
        bleedfilter = args.index_bleed
    elif bleedfilter:
        amptklib.log.info(
            "Will use value of %f%% for index-bleed OTU filtering." %
            (bleedfilter * 100))
    else:
        bleedfilter = 0  # no filtering if you don't pass -p or -b
        amptklib.log.info(
            "No spike-in mock (-b) or index-bleed (-p) specified, thus not running index-bleed filtering"
        )

    if bleedfilter > 0.05:
        amptklib.log.info(
            "Index bleed into samples is abnormally high (%f%%), if you have biological mock you should use `--calculate in`"
            % (bleedfilter * 100))

    # to combat barcode switching, loop through each OTU zeroing every count
    # that is below bleedfilter * (per-OTU reference count)
    top_fractions = {'top25': 0.25, 'top10': 0.10, 'top5': 0.05}
    cleaned = []
    for row in norm_round.itertuples():
        counts = row[1:]  # row[0] is the OTU id (the index)
        if args.threshold == 'max':
            reference = max(counts)
        elif args.threshold == 'sum':
            reference = sum(counts)
        else:
            # sum of the top N% most abundant samples for this OTU
            top = sorted(counts, key=int, reverse=True)
            topn = int(round(len(counts) * top_fractions[args.threshold]))
            reference = sum(top[:topn])
        cutoff = reference * bleedfilter
        cleaned.append([row[0]] + [0 if c < cutoff else c for c in counts])

    header = [OTUhead] + list(norm_round.columns)

    # create data frame of index bleed filtered results
    final = pd.DataFrame(cleaned, columns=header)
    final.set_index(OTUhead, inplace=True)

    if args.drop:  # user passed samples to drop from the table
        amptklib.log.info("Dropping %i samples from table: %s" %
                          (len(args.drop), ', '.join(args.drop)))
        colsdrop = [x for x in args.drop if x in final.columns]
        final.drop(colsdrop, axis=1, inplace=True)

    if args.subtract != 'auto':
        subtract_num = int(args.subtract)
    elif auto_subtract is not None:
        subtract_num = int(auto_subtract)
        amptklib.log.info("Auto subtract filter set to %i" % subtract_num)
    else:
        # 'auto' needs the mock sample to derive a value from
        subtract_num = 0
        amptklib.log.info(
            "Error: to use 'auto' subtract feature, provide a sample name to -b,--mock_barcode."
        )
    if subtract_num != 0:
        amptklib.log.info("Subtracting %i from OTU table" % subtract_num)
        sub = final.subtract(subtract_num)
        sub[sub < 0] = 0  # if negative, change to zero
        sub = sub.loc[~(sub == 0).all(axis=1)]
        sub = sub.astype(int)
        write_table(sub, subtract_table)
        final = sub
    otus_per_sample = final[final > 0].count(axis=0, numeric_only=True)
    stats = pd.concat([fs, otus_per_sample_original, otus_per_sample], axis=1)
    stats.columns = ['reads per sample', 'original OTUs', 'final OTUs']
    stats.fillna(0, inplace=True)
    stats = stats.astype(int)
    if args.show_stats:
        print(stats.to_string())
    stats.to_csv(stats_table, sep=delim)

    # after all filtering, summarize the OTUs left in the mock barcode sample
    if args.mock_barcode:
        mocks = final[args.mock_barcode]
        mocks = mocks.loc[~(mocks == 0)].astype(int)
        totalmismatches = 0
        totallength = 0
        chimera_count = 0
        variant_count = 0
        for otu in mocks.index:
            count = mocks[otu]
            # recover the original OTU id from the renamed label
            if 'suspect_mock' in otu:
                if 'chimera' in otu:
                    chimera_count += 1
                if 'variant' in otu:
                    variant_count += 1
                otu = otu.split('_', 1)[0]
            else:
                otu = otu.split('_')[-1]
            otu_length = SeqLength.get(otu)
            countlen = otu_length * count
            totallength += countlen
            if otu in errorrate:
                totalmismatches += errorrate[otu][1] * count
            else:
                # OTU never aligned to the mock: count every base as an error
                totalmismatches += countlen
        # an empty mock sample would otherwise divide by zero
        if totallength:
            e_rate = totalmismatches / float(totallength) * 100
        else:
            e_rate = 0.0
        amptklib.log.info(args.mock_barcode + ' sample has ' +
                          '{0:,}'.format(len(mocks)) + ' OTUS out of ' +
                          '{0:,}'.format(mock_ref_count) + ' expected; ' +
                          '{0:,}'.format(variant_count) + ' mock variants; ' +
                          '{0:,}'.format(chimera_count) +
                          ' mock chimeras; Error rate: ' +
                          '{0:.3f}%'.format(e_rate))

    # remove the mock sample unless asked to keep it; it may already be gone
    # (e.g. listed in --drop), hence errors='ignore'
    if not args.keep_mock and args.mock_barcode:
        final = final.drop(columns=[args.mock_barcode], errors='ignore')

    # drop OTUs that are now zeros through whole table
    final = final.loc[~(final == 0).all(axis=1)]
    final = final.astype(int)

    # output filtered normalized table
    write_table(final, filtered_table)

    # convert to binary
    final[final > 0] = 1

    # apply min_sample_otu here (most stringent filter, not sure I would use
    # this unless you know what you are doing)
    los = final.sum(axis=1)
    keep = los[los >= args.min_samples_otu].index
    final2 = pd.DataFrame(final, index=keep)
    diff = len(final.index) - len(keep)
    if diff > 0:
        amptklib.log.info(
            'Dropped {:,} OTUs found in fewer than {:,} samples'.format(
                diff, args.min_samples_otu))

    # drop samples that don't have any OTUs after filtering
    final3 = final2.loc[:, (final2 != 0).any(axis=0)]
    final3 = final3.astype(int)

    # get the actual read counts from binary table: wherever an OTU survived
    # in a sample, look up its original count in the sorted input table
    merge = {}
    for sample_name, presence in final3.items():
        original_counts = SortedTable[sample_name]
        merge[sample_name] = [
            flag if flag == 0 else original_counts[otu]
            for otu, flag in presence.items()
        ]

    FiltTable = pd.DataFrame(merge, index=list(final3.index))
    FiltTable.index.name = '#OTU ID'

    # order the filtered table
    FiltTable2 = FiltTable.reindex(index=natsorted(FiltTable.index))
    if not args.col_order:
        FiltTable = FiltTable2.reindex(columns=natsorted(FiltTable2.columns))
    else:
        # keep only the requested names that survived filtering
        remaining = set(FiltTable2.columns.values)
        col_headers = [c for c in args.col_order if c in remaining]
        FiltTable = FiltTable2.reindex(columns=col_headers)

    # check for negative samples and how many OTUs are in these samples
    # if found, filter the OTUs and alert user to rebuild OTU table, I could
    # do this automatically, but would then require there to be reads passed
    # to this script which seems stupid.  Just deleting the OTUs is probably
    # not okay....
    Neg = []
    if args.negatives:
        # a single argument may be a file listing one sample name per line
        if len(args.negatives) == 1 and os.path.isfile(args.negatives[0]):
            with open(args.negatives[0], 'r') as negfile:
                requested = [line.rstrip('\n') for line in negfile]
            requested = [name for name in requested if name]
        else:
            requested = list(args.negatives)
        # split into names present in the final table and names that are not
        table_columns = set(FiltTable.columns.values)
        Neg = [name for name in requested if name in table_columns]
        NotFound = [name for name in requested if name not in table_columns]
        if NotFound:
            amptklib.log.info('Samples not found: %s' % ' '.join(NotFound))
        # slice table
        NegTable = FiltTable.reindex(columns=Neg)
        # just pull out OTUs found in the negative samples
        NegTable = NegTable.loc[~(NegTable == 0).all(axis=1)]
        # keep the mock OTUs, they are needed for filtering a new OTU table
        # (mock_otus is empty when no mock barcode was given)
        NegOTUs = [item for item in NegTable.index if item not in mock_otus]
    else:
        NegOTUs = []

    # check if negative OTUs exist, if so, then output updated OTUs and
    # instructions on creating new OTU table
    if NegOTUs:
        amptklib.log.info("%i OTUs are potentially contamination" %
                          len(NegOTUs))
        otu_clean = base + '.cleaned.otus.fa'
        contaminants = set(NegOTUs)
        # plain 'r': the legacy 'U' mode flag is rejected by Python >= 3.11
        with open(otu_clean, 'w') as otu_update, \
                open(args.fasta, 'r') as myfasta:
            for rec in SeqIO.parse(myfasta, 'fasta'):
                if rec.id not in contaminants:
                    SeqIO.write(rec, otu_update, 'fasta')
        amptklib.log.info("Cleaned OTUs saved to: %s" % otu_clean)
        amptklib.log.info(
            "Generate a new OTU table like so:\namptk remove -i %s --format fasta -l %s -o %s\nvsearch --usearch_global %s --db %s --strand plus --id 0.97 --otutabout newOTU.table.txt\n"
            % (base + '.demux.fq', ' '.join(Neg), base + '.cleaned.fa',
               base + '.cleaned.fa', otu_clean))

    else:  # proceed with rest of script
        # output final table
        write_table(FiltTable, final_table)
        finalSamples = FiltTable.columns.values.tolist()
        numFinalSamples = len(finalSamples)
        amptklib.log.info(
            'Filtered OTU table contains {:,} samples, {:,} OTUs, and {:,} read counts'
            .format(numFinalSamples, len(FiltTable.index),
                    FiltTable.values.sum()))
        # compare against the sample columns of the sorted table (not the raw
        # header list, which may still contain the taxonomy column)
        original_samples = df.columns.values.tolist()
        if numFinalSamples < len(original_samples):
            diffSamples = [
                item for item in original_samples if item not in finalSamples
            ]
            amptklib.log.info('Samples dropped: %s' % (','.join(diffSamples)))
        # output binary table
        write_table(final3, final_binary_table)

        # generate final OTU list for taxonomy
        amptklib.log.info("Finding valid OTUs")
        otu_new = base + '.filtered.otus.fa'
        # plain 'r': the legacy 'U' mode flag is rejected by Python >= 3.11
        with open(otu_new, 'w') as otu_update, \
                open(args.fasta, 'r') as myfasta:
            for rec in SeqIO.parse(myfasta, 'fasta'):
                if ';' in rec.id:
                    rec.id = rec.id.split(';', 1)[0]
                # map new names of mock (annotate_dict is empty without -b)
                if args.mock_barcode and rec.id in annotate_dict:
                    rec.id = annotate_dict[rec.id]
                    rec.description = ''
                if rec.id in final3.index:
                    if rec.id in OTU_tax:
                        otu_update.write(
                            '>%s;%s\n%s\n' %
                            (rec.id, OTU_tax.get(rec.id), rec.seq))
                    else:
                        otu_update.write('>%s\n%s\n' % (rec.id, rec.seq))

        # tell user what output files are
        print("-------------------------------------------------------")
        print("OTU Table filtering finished")
        print("-------------------------------------------------------")
        print("OTU Table Stats:      %s" % stats_table)
        print("Sorted OTU table:     %s" % sorted_table)
        if not args.debug:
            for i in [
                    normal_table_pct, normal_table_nums, subtract_table,
                    mock_out, FastaCounts
            ]:
                amptklib.removefile(i)
        else:
            print("Normalized (pct):     %s" % normal_table_pct)
            print("Normalized (10k):     %s" % normal_table_nums)
            # the subtracted table only exists when something was subtracted
            if subtract_num != 0:
                print("Subtracted table:     %s" % subtract_table)
        print("Normalized/filter:    %s" % filtered_table)
        print("Final Binary table:   %s" % final_binary_table)
        print("Final OTU table:      %s" % final_table)
        print("Filtered OTUs:        %s" % otu_new)
        print("-------------------------------------------------------")

        if 'darwin' in sys.platform:
            print(colr.WARN + "\nExample of next cmd:" + colr.END +
                  " amptk taxonomy -f %s -i %s -m mapping_file.txt -d ITS2\n" %
                  (otu_new, final_table))
        else:
            print(
                "\nExample of next cmd: amptk taxonomy -f %s -i %s -m mapping_file.txt -d ITS2\n"
                % (otu_new, final_table))
Beispiel #4
0
def main(args):
    """Run reference-based OTU clustering on a demultiplexed FASTQ file.

    Steps, in order (intermediate files are written to ``<base>_tmp``):

    1. Build a combined reference FASTA from the optional mock community
       and the selected reference database.
    2. Convert the reads to FASTA, filter by expected errors, dereplicate
       and sort by abundance (vsearch).
    3. Remove chimeras de novo and against the reference (vsearch).
    4. Globally align the remaining sequences to the reference and keep
       hits at or above ``--id``.
    5. Unless ``--closed_ref_only`` is given, cluster the sequences without
       a reference hit de novo (usearch) and keep those that UTAX places
       at ``--utax_level`` or below.
    6. Strip padding, map reads back to the OTUs, build the OTU table and
       copy the final FASTA and table to the current directory.

    Args:
        args: list of command-line argument strings handed to argparse.

    Exits with status 1 when the reference contains duplicate record IDs,
    or when a custom ``--db`` is used without ``--utax_db`` while de novo
    classification is requested.
    """
    parser = argparse.ArgumentParser(
        prog='amptk-OTU_cluster_ref.py',
        usage="%(prog)s [options] -i file.demux.fq\n%(prog)s -h for help menu",
        description='''Script runs UPARSE OTU clustering.
		Requires USEARCH by Robert C. Edgar: http://drive5.com/usearch''',
        epilog="""Written by Jon Palmer (2016) [email protected]""",
        formatter_class=MyFormatter)

    parser.add_argument('-i',
                        '--fastq',
                        dest="FASTQ",
                        required=True,
                        help='FASTQ file (Required)')
    parser.add_argument('-o', '--out', help='Base output name')
    parser.add_argument('-e',
                        '--maxee',
                        default='1.0',
                        help='Quality trim EE value')
    parser.add_argument('-p',
                        '--pct_otu',
                        default='97',
                        help="OTU Clustering Percent")
    parser.add_argument('--id', default='97', help="Threshold for alignment")
    parser.add_argument('-m',
                        '--minsize',
                        default='2',
                        help='Min identical seqs to process')
    parser.add_argument('-u',
                        '--usearch',
                        dest="usearch",
                        default='usearch9',
                        help='USEARCH9 EXE')
    parser.add_argument('--map_filtered',
                        action='store_true',
                        help='map quality filtered reads back to OTUs')
    parser.add_argument(
        '-d',
        '--db',
        required=True,
        help='Reference Database [ITS,ITS1,ITS2,16S,LSU,COI,custom]')
    parser.add_argument('--utax_db', help='UTAX Reference Database')
    parser.add_argument('--utax_cutoff',
                        default=0.8,
                        type=restricted_float,
                        help='UTAX confidence value threshold.')
    parser.add_argument('--utax_level',
                        default='k',
                        choices=['k', 'p', 'c', 'o', 'f', 'g', 's'],
                        help='UTAX classification level to retain')
    parser.add_argument('--mock',
                        default='synmock',
                        help='Spike-in mock community (fasta)')
    # The tmp folder is only deleted when --debug is NOT given (see the
    # clean-up near the end), so the help text describes it as "keep".
    parser.add_argument('--debug',
                        action='store_true',
                        help='Keep Intermediate Files')
    parser.add_argument('--closed_ref_only',
                        action='store_true',
                        help='Only run closed reference clustering')
    parser.add_argument('--cpus',
                        type=int,
                        help="Number of CPUs. Default: auto")
    args = parser.parse_args(args)

    parentdir = os.path.join(os.path.dirname(amptklib.__file__))

    #get basename if not args.out passed
    if args.out:
        base = args.out
    elif 'demux' in args.FASTQ:
        base = os.path.basename(args.FASTQ).split('.demux')[0]
    else:
        base = os.path.basename(args.FASTQ).split('.f')[0]

    taxonomyLookup = {
        'k': 'Kingdom',
        'p': 'Phylum',
        'c': 'Class',
        'o': 'Order',
        'f': 'Family',
        'g': 'Genus',
        's': 'Species'
    }

    #remove logfile if exists
    log_name = base + '.amptk-cluster_ref.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    amptklib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")

    #initialize script, log system info and usearch version
    amptklib.SystemInfo()
    #Do a version check
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)

    #get number of cpus
    cpus = args.cpus if args.cpus else amptklib.getCPUS()

    #make tmp folder
    tmp = base + '_tmp'
    if not os.path.exists(tmp):
        os.makedirs(tmp)

    #Setup DB locations and names, etc: name -> (search DB, UTAX DB)
    DBdir = os.path.join(parentdir, 'DB')
    DataBase = {
        'ITS1':
        (os.path.join(DBdir, 'ITS.udb'), os.path.join(DBdir, 'ITS1_UTAX.udb')),
        'ITS2':
        (os.path.join(DBdir, 'ITS.udb'), os.path.join(DBdir, 'ITS2_UTAX.udb')),
        'ITS': (os.path.join(DBdir,
                             'ITS.udb'), os.path.join(DBdir, 'ITS_UTAX.udb')),
        '16S': (os.path.join(DBdir, '16S.udb'), os.path.join(DBdir,
                                                             '16S.udb')),
        'LSU': (os.path.join(DBdir,
                             'LSU.udb'), os.path.join(DBdir, 'LSU_UTAX.udb')),
        'COI': (os.path.join(DBdir,
                             'COI.udb'), os.path.join(DBdir, 'COI_UTAX.udb'))
    }

    #setup refDB
    amptklib.log.info("Checking Reference Database")
    if args.db in DataBase:
        #need to write to fasta from vsearch UDB
        DB = os.path.join(tmp, args.db + '.extracted.fa')
        cmd = [
            'vsearch', '--udb2fasta',
            DataBase.get(args.db)[0], '--output', DB
        ]
        amptklib.runSubprocess(cmd, amptklib.log)
    else:
        DB = os.path.abspath(args.db)
    refDB = os.path.join(tmp, 'reference_DB.fa')

    #mock community (if any) goes first, followed by the reference itself
    ref_sources = []
    if args.mock:
        if args.mock == 'synmock':
            ref_sources.append(
                os.path.join(parentdir, 'DB', 'amptk_synmock.fa'))
        else:
            ref_sources.append(os.path.abspath(args.mock))
    ref_sources.append(DB)

    # Record every ID as it is written; the original never added to
    # ``seen`` so the duplicate check below could not trigger.
    seen = set()
    with open(refDB, 'w') as output:
        for source in ref_sources:
            with open(source) as handle:
                for rec in SeqIO.parse(handle, 'fasta'):
                    if rec.id in seen:
                        amptklib.log.error(
                            "Duplicate ID's in Ref DB: %s, exiting" % rec.id)
                        sys.exit(1)
                    seen.add(rec.id)
                    SeqIO.write(rec, output, 'fasta')

    #get utax_database (only needed when de novo classification runs)
    utaxDB = None
    if args.db in DataBase:
        utaxDB = DataBase.get(args.db)[1]
    elif not args.closed_ref_only:
        if args.utax_db:
            utaxDB = os.path.abspath(args.utax_db)
        else:
            amptklib.log.error(
                "%s not pre-installed DB, must then also specify valid UTAX database via --utax_db"
                % args.db)
            sys.exit(1)

    #Count FASTQ records
    amptklib.log.info("Loading FASTQ Records")
    #convert to FASTA for mapping
    orig_fasta = os.path.join(tmp, base + '.orig.fa')
    cmd = [
        'vsearch', '--fastq_filter', args.FASTQ, '--fastaout', orig_fasta,
        '--fastq_qmax', '55', '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    orig_total = amptklib.countfasta(orig_fasta)
    size = amptklib.checkfastqsize(args.FASTQ)
    readablesize = amptklib.convertSize(size)
    amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize +
                      ')')

    #Expected Errors filtering step
    filter_out = os.path.join(tmp, base + '.EE' + args.maxee + '.filter.fq')
    filter_fasta = os.path.join(tmp, base + '.EE' + args.maxee + '.filter.fa')
    amptklib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
    cmd = [
        'vsearch', '--fastq_filter', args.FASTQ, '--fastq_maxee',
        str(args.maxee), '--fastqout', filter_out, '--fastaout', filter_fasta,
        '--fastq_qmax', '55', '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    qtrimtotal = amptklib.countfastq(filter_out)
    amptklib.log.info('{0:,}'.format(qtrimtotal) + ' reads passed')

    #now run full length dereplication
    derep_out = os.path.join(tmp, base + '.EE' + args.maxee + '.derep.fa')
    amptklib.log.info("De-replication (remove duplicate reads)")
    # '--threads' was previously passed twice here; once is enough.
    cmd = [
        'vsearch', '--derep_fulllength', filter_fasta, '--sizeout', '--output',
        derep_out, '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(derep_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')

    #now run sort by size
    sort_out = os.path.join(tmp, base + '.EE' + args.maxee + '.sort.fa')
    amptklib.log.info(
        "Sorting reads by size: removing reads seen less than %s times" %
        args.minsize)
    cmd = [
        'vsearch', '--sortbysize', derep_out, '--minsize', args.minsize,
        '--output', sort_out, '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(sort_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')

    #chimera detection
    #first run through de novo chimera detection
    amptklib.log.info("De novo chimera detection (VSEARCH)")
    chimera_out = os.path.join(tmp,
                               base + '.EE' + args.maxee + '.chimera_check.fa')
    cmd = [
        'vsearch', '--uchime_denovo', sort_out, '--relabel', 'Seq',
        '--sizeout', '--nonchimeras', chimera_out, '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(chimera_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')

    #now run uchime_ref
    uchime_out = os.path.join(tmp,
                              base + '.EE' + args.maxee + '.uchime.otus.fa')
    #now run chimera filtering if all checks out
    amptklib.log.info("Chimera Filtering (VSEARCH)")
    cmd = [
        'vsearch', '--mindiv', '1.0', '--uchime_ref', chimera_out, '--db',
        refDB, '--sizeout', '--nonchimeras', uchime_out, '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(uchime_out)
    amptklib.log.info('{0:,}'.format(total) + ' OTUs passed')

    #now run usearch_global versus reference database
    align_out = os.path.join(tmp, base + '.align.uc')
    pident = int(args.id) * 0.01
    amptklib.log.info(
        "Reference Clustering using Global Alignment, %s%% identity" % args.id)
    cmd = [
        'vsearch', '--usearch_global', uchime_out, '--db', refDB, '--id',
        str(pident), '--output_no_hits', '--top_hits_only', '--notrunclabels',
        '--uc', align_out, '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)

    #parse results: query label -> (target label, percent identity, size)
    ref_results = {}
    nohits = set()  # only ever used for membership tests
    min_identity = float(args.id)
    with open(align_out, 'r') as alignment:
        for line in alignment:
            col = line.replace('\n', '').split('\t')
            query = col[8]
            # second ';' field of the query label is expected to be size=N
            counts = int(query.split(';')[1].replace('size=', ''))
            if col[3] == '*':
                nohits.add(query)
                continue
            if float(col[3]) >= min_identity:
                if query not in ref_results:
                    ref_results[query] = (col[9], col[3], counts)
                else:
                    print("Error: %s duplicated ID" % query)
            else:
                nohits.add(query)

    #summarize results from first ref clustering
    num_refcluster = len(ref_results)
    seqs_refcluster = sum(hit[2] for hit in ref_results.values())
    # guard the percentage so an empty filtered set reports 0% instead of
    # raising ZeroDivisionError
    if qtrimtotal:
        pct_classified = seqs_refcluster / float(qtrimtotal) * 100
    else:
        pct_classified = 0.0
    amptklib.log.info("%i OTUs classified " % num_refcluster +
                      "({0:.0f}%".format(pct_classified) + " of reads)")

    #get ref clustered hits to file with taxonomy
    ref_clustered = os.path.join(tmp, base + '.ref_clustered.fa')
    # running OTU number; continues into the UTAX section further down
    otu_counter = 1
    with open(ref_clustered, 'w') as refoutput:
        with open(uchime_out, 'r') as handle:
            for rec in SeqIO.parse(handle, 'fasta'):
                if rec.id in ref_results:
                    tax, hit_pident = ref_results[rec.id][:2]
                    rec.id = 'OTU' + str(
                        otu_counter) + ';pident=' + hit_pident + ';' + tax
                    rec.name = ''
                    rec.description = ''
                    SeqIO.write(rec, refoutput, 'fasta')
                    otu_counter += 1

    if not args.closed_ref_only:
        #get nohits file to run clustering
        utax_ref = os.path.join(tmp,
                                base + '.EE' + args.maxee + '.utax_ref.fa')
        with open(utax_ref, 'w') as output:
            with open(uchime_out, 'r') as handle:
                for rec in SeqIO.parse(handle, 'fasta'):
                    if rec.id in nohits:
                        SeqIO.write(rec, output, 'fasta')

        #input needs to be sorted, so
        ref_sort = os.path.join(tmp, base + '.utax_ref.sorted.fa')
        cmd = [
            'vsearch', '--sortbysize', utax_ref, '--minsize', args.minsize,
            '--output', ref_sort, '--threads',
            str(cpus)
        ]
        amptklib.runSubprocess(cmd, amptklib.log)

        #now run clustering algorithm on those not found in reference database
        radius = str(100 - int(args.pct_otu))
        otu_out = os.path.join(tmp, base + '.EE' + args.maxee + '.otus.fa')
        amptklib.log.info("De novo Clustering remaining sequences (UPARSE)")
        cmd = [
            usearch, '-cluster_otus', ref_sort, '-relabel', 'OTU',
            '-otu_radius_pct', radius, '-otus', otu_out
        ]
        amptklib.runSubprocess(cmd, amptklib.log)
        total = amptklib.countfasta(otu_out)
        amptklib.log.info('{0:,}'.format(total) + ' de novo OTUs')

        #try utax reference clustering
        utax_out = os.path.join(tmp, base + '.utax.out')
        amptklib.log.info("Reference Clustering de novo OTUs using UTAX")
        cmd = [
            usearch, '-cluster_otus_utax', otu_out, '-db', utaxDB,
            '-utax_cutoff',
            str(args.utax_cutoff), '-utax_level', 's', '-strand', 'plus',
            '-utaxout', utax_out
        ]
        amptklib.runSubprocess(cmd, amptklib.log)
        #setup tax filtering: keep the requested rank and every finer rank
        tax_values = ['k', 'p', 'c', 'o', 'f', 'g', 's']
        filter_index = tax_values.index(args.utax_level)
        filt_tax_values = [s + ':' for s in tax_values[filter_index:]]
        #get results from utax
        seqDict = SeqIO.index(otu_out, 'fasta')
        try:
            with open(ref_clustered, 'a') as output:
                with open(utax_out, 'r') as utax:
                    for line in utax:
                        col = line.replace('\n', '').split('\t')
                        ID = col[0]
                        tax = col[2]
                        if any(x in tax for x in filt_tax_values):
                            record = seqDict[ID]
                            record.id = 'OTU' + str(
                                otu_counter) + ';UTAX;tax=' + tax
                            record.name = ''
                            record.description = ''
                            SeqIO.write(record, output, 'fasta')
                            otu_counter += 1
        finally:
            # release the index's file handle before tmp is removed
            seqDict.close()
        total = amptklib.countfasta(ref_clustered) - num_refcluster
        amptklib.log.info('{0:,}'.format(total) + ' classified to %s' %
                          taxonomyLookup.get(args.utax_level))

    #clean up padded N's
    amptklib.log.info("Cleaning up padding from OTUs")
    otu_clean = os.path.join(tmp, base + '.clean.otus.fa')
    amptklib.fasta_strip_padding(ref_clustered, otu_clean)
    total = amptklib.countfasta(otu_clean)
    amptklib.log.info('{0:,}'.format(total) + ' total OTUs')

    #now map reads back to OTUs
    uc_out = os.path.join(tmp, base + '.EE' + args.maxee + '.mapping.uc')
    otu_table = os.path.join(tmp, base + '.EE' + args.maxee + '.otu_table.txt')
    #setup reads to map
    reads = filter_fasta if args.map_filtered else orig_fasta
    amptklib.log.info("Mapping Reads to OTUs and Building OTU table")
    cmd = [
        'vsearch', '--usearch_global', reads, '--strand', 'plus', '--id',
        '0.97', '--db', otu_clean, '--uc', uc_out, '--otutabout', otu_table,
        '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)

    #count reads mapped
    total = amptklib.line_count2(uc_out)
    pct_mapped = total / float(orig_total) * 100 if orig_total else 0.0
    amptklib.log.info('{0:,}'.format(total) + ' reads mapped to OTUs ' +
                      '({0:.0f}%)'.format(pct_mapped))

    #Move files around, delete tmp unless --debug was passed.
    currentdir = os.getcwd()
    final_otu = os.path.join(currentdir, base + '.cluster.otus.fa')
    shutil.copyfile(otu_clean, final_otu)
    final_otu_table = os.path.join(currentdir, base + '.otu_table.txt')
    shutil.copyfile(otu_table, final_otu_table)

    if not args.debug:
        shutil.rmtree(tmp)

    #Print location of files to STDOUT
    print("-------------------------------------------------------")
    print("OTU Clustering Script has Finished Successfully")
    print("-------------------------------------------------------")
    if args.debug:
        print("Tmp Folder of files: %s" % tmp)
    print("Clustered OTUs: %s" % os.path.basename(final_otu))
    print("OTU Table: %s" % os.path.basename(final_otu_table))
    print("-------------------------------------------------------")

    # basename() instead of split('/') so the hint is right on any OS
    otu_print = os.path.basename(final_otu)
    tab_print = os.path.basename(final_otu_table)
    if 'darwin' in sys.platform:
        print(colr.WARN + "\nExample of next cmd:" + colr.END +
              " amptk filter -i %s -f %s -b <mock barcode>\n" %
              (tab_print, otu_print))
    else:
        print(
            "\nExample of next cmd: amptk filter -i %s -f %s -b <mock barcode>\n"
            % (tab_print, otu_print))
Beispiel #5
0
def main(args):
	"""Split reads by barcode into per-sample files and write SRA metadata.

	For ``--platform illumina`` the input must be a folder; its FASTQ files
	are copied into the output folder unchanged.  For the other platforms
	the input FASTQ is scanned once: each read whose barcode is found is
	trimmed of that barcode, optionally required to contain the forward
	and/or reverse primer, length filtered, and appended to
	``<out>/<barcode label>.fastq``; the per-sample files are then gzipped.

	When ``--biosample`` is given, a tab-delimited ``<out>.submission.txt``
	is written with one row per output file that can be matched to a
	sample name in the BioSample table.

	Args:
		args: list of command-line argument strings handed to argparse.

	Exits with status 1 on a missing mapping/reverse-barcode file, an
	existing output folder without ``--force``, or missing barcodes.
	"""
	parser=argparse.ArgumentParser(prog='amptk-fastq2sra.py', usage="%(prog)s [options] -i folder",
		description='''Script to split FASTQ file from Ion, 454, or Illumina by barcode sequence into separate files for submission to SRA.  This script can take the BioSample worksheet from NCBI and create an SRA metadata file for submission.''',
		epilog="""Written by Jon Palmer (2015) [email protected]""",
		formatter_class=MyFormatter)
	parser.add_argument('-i','--input', dest='FASTQ', required=True, help='Input FASTQ file or folder')
	parser.add_argument('-o','--out', dest='out', help='Basename for output folder/files')
	parser.add_argument('--min_len', default=50, type=int, help='Minimum length of read to keep')
	parser.add_argument('-b','--barcode_fasta', help='Multi-fasta file containing barcodes used')
	parser.add_argument('--reverse_barcode', help='Reverse barcode fasta file')
	parser.add_argument('-s','--biosample', dest='biosample', help='BioSample file from NCBI')
	parser.add_argument('-p','--platform', dest='platform', default='ion', choices=['ion', 'illumina', '454'], help='Sequencing platform')
	parser.add_argument('-f','--fwd_primer', dest="F_primer", default='fITS7', help='Forward Primer (fITS7)')
	parser.add_argument('-r','--rev_primer', dest="R_primer", default='ITS4', help='Reverse Primer (ITS4)')
	parser.add_argument('-n', '--names', help='CSV mapping file BC,NewName')
	parser.add_argument('-d', '--description', help='Paragraph description for SRA metadata')
	parser.add_argument('-t','--title', default='Fungal ITS', help='Start of title for SRA submission, name it according to amplicon')
	parser.add_argument('-m','--mapping_file', help='Mapping file: QIIME format can have extra meta data columns')
	parser.add_argument('--primer_mismatch', default=2, type=int, help='Number of mis-matches in primer')
	parser.add_argument('--barcode_mismatch', default=0, type=int, help='Number of mis-matches in barcode')
	parser.add_argument('--require_primer', default='off', choices=['forward', 'both', 'off'], help='Require Primers to be present')
	parser.add_argument('--force', action='store_true', help='Overwrite existing directory')
	parser.add_argument('-a','--append', help='Append a name to all sample names for a run, i.e. --append run1 would yield Sample_run1')
	args=parser.parse_args(args)

	#get basename if not args.out passed
	if args.out:
		base = args.out
	elif 'demux' in args.FASTQ:
		base = os.path.basename(args.FASTQ).split('.demux')[0]
	else:
		base = os.path.basename(args.FASTQ).split('.f')[0]

	log_name = base + '.amptk-sra.log'
	if os.path.isfile(log_name):
		os.remove(log_name)

	amptklib.setupLogging(log_name)
	cmd_args = " ".join(sys.argv)+'\n'
	amptklib.log.debug(cmd_args)
	print("-------------------------------------------------------")
	amptklib.SystemInfo()

	amptkversion = amptklib.get_version()

	#create output directory
	if not os.path.exists(base):
		os.makedirs(base)
	elif not args.force:
		amptklib.log.error("Directory %s exists, add --force argument to overwrite" % base)
		sys.exit(1)
	else:
		shutil.rmtree(base)
		os.makedirs(base)

	#parse a mapping file or a barcode fasta file, primers, etc get setup
	#dealing with Barcodes, get ion barcodes or parse the barcode_fasta argument
	barcode_file = os.path.join(base, base + ".barcodes_used.fa")
	rev_barcode_file = os.path.join(base, base + ".revbarcodes_used.fa")
	if os.path.isfile(barcode_file):
		os.remove(barcode_file)

	#check if mapping file passed, use this if present, otherwise use command line arguments
	SampleData = {}
	Barcodes = {}
	RevBarcodes = {}
	FwdPrimer = ''
	RevPrimer = ''
	if args.mapping_file:
		if not os.path.isfile(args.mapping_file):
			amptklib.log.error("Mapping file not found: %s" % args.mapping_file)
			sys.exit(1)
		SampleData, Barcodes, RevBarcodes, FwdPrimer, RevPrimer = amptklib.parseMappingFileNEW(args.mapping_file)
	elif args.barcode_fasta:
		# This parser defines no --multi option, so the old
		# ``args.multi+'.'+rec.id`` raised AttributeError; barcode names
		# are written as they appear in the input.
		with open(barcode_file, 'w') as barcodeout:
			with open(args.barcode_fasta, 'r') as handle:
				for rec in SeqIO.parse(handle, 'fasta'):
					barcodeout.write(">%s\n%s\n" % (rec.id, rec.seq))
	# The reverse-barcode FASTA is not rewritten here: the validated
	# copyfile() further down always replaces that file anyway.

	#parse primers here so doesn't conflict with mapping primers
	#look up primer db otherwise default to entry
	if FwdPrimer == '':
		if args.F_primer in amptklib.primer_db:
			FwdPrimer = amptklib.primer_db.get(args.F_primer)
			amptklib.log.info("{:} fwd primer found in AMPtk primer db, setting to: {:}".format(args.F_primer, FwdPrimer))
		else:
			FwdPrimer = args.F_primer
			amptklib.log.info("{:} fwd primer not found in AMPtk primer db, assuming it is actual primer sequence.".format(args.F_primer))
	if RevPrimer == '':
		if args.R_primer in amptklib.primer_db:
			RevPrimer = amptklib.primer_db.get(args.R_primer)
			amptklib.log.info("{:} rev primer found in AMPtk primer db, setting to: {:}".format(args.R_primer, RevPrimer))
		else:
			RevPrimer = args.R_primer
			amptklib.log.info("{:} rev primer not found in AMPtk primer db, assuming it is actual primer sequence.".format(args.R_primer))

	#then setup barcode dictionary
	if len(Barcodes) < 1 and os.path.isfile(barcode_file):
		Barcodes = amptklib.fasta2barcodes(barcode_file, False)

	#setup for looking for reverse barcode
	if len(RevBarcodes) < 1 and args.reverse_barcode:
		if not os.path.isfile(args.reverse_barcode):
			amptklib.log.info("Reverse barcode is not a valid file, exiting")
			sys.exit(1)
		shutil.copyfile(args.reverse_barcode, rev_barcode_file)
		RevBarcodes = amptklib.fasta2barcodes(rev_barcode_file, True)

	if args.platform != 'illumina':
		if not args.mapping_file and not args.barcode_fasta:
			amptklib.log.error("For ion, 454, or illumina2 datasets you must specificy a multi-fasta file containing barcodes with -b, --barcode_fasta, or -m/--mapping_file")
			sys.exit(1)

	# path of a temporary uncompressed copy of the input, if one is made
	unzipped_input = None

	if args.platform == 'illumina':
		#just need to get the correct .fastq.gz files into a folder by themselves
		#if illumina is selected, verify that args.fastq is a folder
		if not os.path.isdir(args.FASTQ):
			amptklib.log.error("%s is not a folder, for '--platform illumina', -i must be a folder containing raw reads" % (args.FASTQ))
			sys.exit(1)
		rawlist = []
		filelist = []
		for file in os.listdir(args.FASTQ):
			if file.endswith(".fastq.gz") or file.endswith('.fastq') or file.endswith('.fq'):
				rawlist.append(file)
		if len(rawlist) > 0:
			ordered = sorted(rawlist)
			# A lone file cannot be paired; indexing [1] unconditionally
			# raised IndexError in that case.
			if len(ordered) < 2 or '_R2' not in ordered[1]:
				amptklib.log.info("Found %i single files, copying to %s folder" % (len(rawlist), base))
				filelist = rawlist
				for file in rawlist:
					shutil.copyfile(os.path.join(args.FASTQ,file),(os.path.join(base,file)))
			else:
				amptklib.log.info("Found %i paired-end files, copying to %s folder" % (len(rawlist) // 2, base))
				for file in rawlist:
					shutil.copyfile(os.path.join(args.FASTQ,file),(os.path.join(base,file)))
					if '_R1' in file:
						filelist.append(file)

	else:
		#start here to process the reads, first reverse complement the reverse primer
		ReverseCompRev = amptklib.RevComp(RevPrimer)

		#if --names given, load into dictonary
		# NOTE(review): namesDict is filled here but not read again within
		# this function - confirm whether renaming was meant to use it.
		if args.names:
			amptklib.log.info("Parsing names for output files via %s" % args.names)
			namesDict = {}
			with open(args.names, 'r') as handle:
				for line in handle:
					line = line.replace('\n', '')
					cols = line.split(',')
					if not cols[0] in namesDict:
						namesDict[cols[0]] = cols[1]

		#check for compressed input file
		if args.FASTQ.endswith('.gz'):
			amptklib.log.info("Gzipped input files detected, uncompressing")
			FASTQ_IN = args.FASTQ.replace('.gz', '')
			amptklib.Funzip(args.FASTQ, FASTQ_IN, multiprocessing.cpu_count())
			unzipped_input = FASTQ_IN
		else:
			FASTQ_IN = args.FASTQ

		#count FASTQ records in input
		amptklib.log.info("Loading FASTQ Records")
		total = amptklib.countfastq(FASTQ_IN)
		size = amptklib.checkfastqsize(args.FASTQ)
		readablesize = amptklib.convertSize(size)
		amptklib.log.info('{0:,}'.format(total) + ' reads (' + readablesize + ')')

		#output message depending on primer requirement
		if args.require_primer == 'off':
			amptklib.log.info("Looking for %i barcodes" % (len(Barcodes)))
		elif args.require_primer == 'forward':
			amptklib.log.info("Looking for %i barcodes that must have FwdPrimer: %s" % (len(Barcodes), FwdPrimer))
		elif args.require_primer == 'both':
			amptklib.log.info("Looking for %i barcodes that must have FwdPrimer: %s and RevPrimer: %s" % (len(Barcodes), FwdPrimer, RevPrimer))

		#this will loop through FASTQ file once, splitting those where barcodes are found, and primers trimmed
		runningTotal = 0
		with open(FASTQ_IN, 'r') as handle:
			for title, seq, qual in FastqGeneralIterator(handle):
				Barcode, BarcodeLabel = amptklib.AlignBarcode(seq, Barcodes, args.barcode_mismatch)
				if Barcode == "":
					continue
				#trim barcode from sequence
				BarcodeLength = len(Barcode)
				seq = seq[BarcodeLength:]
				qual = qual[BarcodeLength:]
				#look for forward primer
				if args.require_primer != 'off': #means we only want ones with forward primer and or reverse, but don't remove them
					#now search for forward primer
					foralign = edlib.align(FwdPrimer, seq, mode="HW", k=args.primer_mismatch, additionalEqualities=amptklib.degenNuc)
					if foralign["editDistance"] < 0:
						continue
					if args.require_primer == 'both':
						#now search for reverse primer
						revalign = edlib.align(ReverseCompRev, seq, mode="HW", task="locations", k=args.primer_mismatch, additionalEqualities=amptklib.degenNuc)
						if revalign["editDistance"] < 0:  #reverse primer was not found
							continue
				#check size
				if len(seq) < args.min_len: #filter out sequences less than minimum length.
					continue
				runningTotal += 1
				fileout = os.path.join(base, BarcodeLabel+'.fastq')
				with open(fileout, 'a') as output:
					output.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))

		if args.require_primer == 'off':
			amptklib.log.info('{0:,}'.format(runningTotal) + ' total reads with valid barcode')
		elif args.require_primer == 'forward':
			amptklib.log.info('{0:,}'.format(runningTotal) + ' total reads with valid barcode and fwd primer')
		elif args.require_primer == 'both':
			amptklib.log.info('{0:,}'.format(runningTotal) + ' total reads with valid barcode and both primers')

		amptklib.log.info("Now Gzipping files")
		for file in os.listdir(base):
			if file.endswith(".fastq"):
				file_path = os.path.join(base, file)
				amptklib.Fzip_inplace(file_path)

		#after all files demuxed into output folder, loop through and create SRA metadata file
		filelist = []
		for file in os.listdir(base):
			if file.endswith(".fastq.gz"):
				filelist.append(file)

	amptklib.log.info("Finished: output in %s" % base)
	#clean up if gzipped; only when a temporary copy was actually made
	#(FASTQ_IN was unbound here on the illumina path)
	if unzipped_input is not None:
		amptklib.removefile(unzipped_input)

	#check for BioSample meta file
	if args.biosample:
		amptklib.log.info("NCBI BioSample file detected, creating SRA metadata file")
		#load in BioSample file to dictionary
		with open(args.biosample, 'r') as handle:
			reader = csv.reader(handle, delimiter=str('\t'))
			header = next(reader)
			acc = header.index('Accession')
			sample = header.index('Sample Name')
			bio = header.index('BioProject')
			try:
				host = header.index('Host')
			except ValueError:
				host = header.index('Organism')
			# csv.reader yields [] for blank lines; skip them rather
			# than fail with IndexError
			BioDict = {col[sample]:(col[acc],col[bio],col[host]) for col in reader if col}
		#set some defaults based on the platform
		header = 'bioproject_accession\tbiosample_accession\tlibrary_ID\ttitle\tlibrary_strategy\tlibrary_source\tlibrary_selection\tlibrary_layout\tplatform\tinstrument_model\tdesign_description\tfiletype\tfilename\tfilename2\tforward_barcode\treverse_barcode\tforward_primer\treverse_primer\n'
		if args.platform == 'ion':
			sequencer = 'ION_TORRENT'
			model = 'Ion Torrent PGM'
			lib_layout = 'single'
		elif args.platform == '454':
			sequencer = '_LS454'
			model = '454 GS FLX Titanium'
			lib_layout = 'single'
		elif args.platform == 'illumina':
			sequencer = 'ILLUMINA'
			model = 'Illumina MiSeq'
			lib_layout = 'paired'
		else:
			amptklib.log.error("You specified a platform that is not supported")
			sys.exit(1)
		lib_strategy = 'AMPLICON'
		lib_source = 'GENOMIC'
		lib_selection = 'RANDOM PCR'
		filetype = 'fastq'

		#the description does not depend on the file, so build it once
		if not args.description:
			description = '%s amplicon library was created using a barcoded fusion primer PCR protocol using Pfx50 polymerase (Thermo Fisher Scientific), size selected, and sequenced on the %s platform.  Sequence data was minimally processed, sequences were exported directly from the sequencing platform and only the barcode (index sequence) was trimmed prior to SRA submission. SRA submission generated with AMPtk %s' % (args.title, model, amptkversion.split(' ')[-1])
		else:
			description = args.description

		#now open file for writing, input header and then loop through samples
		sub_out = base + '.submission.txt'
		with open(sub_out, 'w') as output:
			output.write(header)
			for file in filelist:
				barcode_for = ''
				barcode_rev = ''
				if args.platform == 'ion' or args.platform == '454':
					name = file.split(".fastq")[0]
					if not name in BioDict: #lets try to look a bit harder, i.e. split on _ and - and look again
						searchname = name.replace('-', '_')
						searchname = searchname.split('_')[0]
						if not searchname in BioDict: #if still not found, then skip
							continue
					else:
						searchname = name
					# every lookup goes through searchname; the title used
					# to read BioDict.get(name), which is None whenever
					# only the shortened name matched
					biosample_acc, bioproject, organism = BioDict[searchname]
					if not bioproject.startswith('PRJNA'):
						bioproject = 'PRJNA'+bioproject
					sample_name = biosample_acc
					title = '%s amplicon sequencing of %s: sample %s' % (args.title, organism, name)
					bc_name = file.split(".f")[0]
					if bc_name in Barcodes:
						barcode_for = Barcodes.get(bc_name)
					if bc_name in RevBarcodes:
						barcode_rev = RevBarcodes.get(bc_name)
					if args.append:
						finalname = name+'_'+args.append
						#also need to change the name for output files
						newfile = file.replace(name, finalname)
						os.rename(os.path.join(base, file), os.path.join(base, newfile))
					else:
						finalname = name
						newfile = file
					line = [bioproject,sample_name,finalname,title,lib_strategy,lib_source,lib_selection,lib_layout,sequencer,model,description,filetype,newfile,'',barcode_for,barcode_rev,FwdPrimer,RevPrimer]
				elif args.platform == 'illumina':
					name = file.split("_")[0]
					if not name in BioDict:
						amptklib.log.info('{:} not found in BioSample text file'.format(name))
						continue
					biosample_acc, bioproject, organism = BioDict[name]
					if not bioproject.startswith('PRJNA'):
						bioproject = 'PRJNA'+bioproject
					sample_name = biosample_acc
					title = '%s amplicon sequencing of %s: sample %s' % (args.title, organism, name)
					file2 = file.replace('_R1', '_R2')
					#count number of _ in name, determines the dataformat
					fields = file.count("_")
					if fields > 3: #this is full illumina name with dual barcodes
						dualBC = file.split("_")[1]
						if '-' in dualBC:
							barcode_for = dualBC.split('-')[0]
							barcode_rev = dualBC.split('-')[1]
					elif fields == 3: #this is older reverse barcoded name
						barcode_for = ''
						barcode_rev = file.split("_")[1]
					if args.append:
						finalname = name+'_'+args.append
						newfile = file.replace(name, finalname)
						newfile2 = file2.replace(name, finalname)
						#also need to change the name for output files
						#(was the undefined name ``newfile1`` -> NameError)
						os.rename(os.path.join(base, file), os.path.join(base, newfile))
						#only rename the mate when it is a distinct file that exists
						mate_path = os.path.join(base, file2)
						if file2 != file and os.path.isfile(mate_path):
							os.rename(mate_path, os.path.join(base, newfile2))
					else:
						finalname = name
						newfile = file
						newfile2 = file2
					line = [bioproject,sample_name,finalname,title,lib_strategy,lib_source,lib_selection,lib_layout,sequencer,model,description,filetype,newfile,newfile2,barcode_for,barcode_rev,FwdPrimer,RevPrimer]
				#write output to file
				output.write('\t'.join(line)+'\n')
		amptklib.log.info("SRA submission file created: %s" % sub_out)
Beispiel #6
0
def main(args):
    """Assign taxonomy to OTUs and write the annotated result files.

    Parses ``args`` with argparse, then either runs the selected
    classification method (``blast``, ``rdp``, ``utax``, ``usearch``,
    ``sintax`` or the combined ``hybrid``) on the ``--fasta`` sequences or,
    when ``--taxonomy`` is given, loads a precomputed two-column taxonomy
    file.  The resulting OTU -> taxonomy mapping is then written out.

    Files written (``base`` is ``--out`` or derived from ``--fasta``):
        * ``base.otus.taxonomy.fa``  - input FASTA with taxonomy descriptions
        * ``base.taxonomy.txt``      - taxonomy table (not with ``--taxonomy``)
        * ``base.tree.phy``          - tree from ``usearch -cluster_agg``
        * ``base.otu_table.taxonomy.txt`` and ``base.biom`` when
          ``--otu_table`` is given (BIOM only if ``biom`` is on PATH and the
          method is not ``blast``)

    Args:
        args: list of command-line tokens handed to ``parse_args``.

    Exits with status 1 (via ``sys.exit``) when no database is selected, a
    selected database file is missing, a required executable cannot be
    started, or hybrid taxonomy parsing yields no predictions.
    """
    parser = argparse.ArgumentParser(
        prog='amptk-assign_taxonomy.py',
        usage="%(prog)s [options] -f <FASTA File>",
        description='''assign taxonomy to OTUs''',
        epilog="""Written by Jon Palmer (2015) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i',
                        '--otu_table',
                        dest="otu_table",
                        help='Append Taxonomy to OTU table')
    parser.add_argument('-f', '--fasta', required=True, help='FASTA input')
    parser.add_argument('-o', '--out', help='Output file (FASTA)')
    parser.add_argument(
        '-m',
        '--mapping_file',
        help='Mapping file: QIIME format can have extra meta data columns')
    parser.add_argument(
        '--method',
        default='hybrid',
        choices=['utax', 'usearch', 'sintax', 'hybrid', 'rdp', 'blast'],
        help='Taxonomy method')
    parser.add_argument(
        '-d',
        '--db',
        help='Pre-installed Databases: [ITS,ITS1,ITS2,16S,LSU,COI]')
    parser.add_argument(
        '-t',
        '--taxonomy',
        help='Incorporate taxonomy calculated elsewhere, 2 column file')
    parser.add_argument('--fasta_db',
                        help='Alternative database of fasta sequences')
    parser.add_argument('--add2db',
                        help='Custom FASTA database to add to DB on the fly')
    parser.add_argument('--utax_db', help='UTAX Reference Database')
    parser.add_argument('--utax_cutoff',
                        default=0.8,
                        type=restricted_float,
                        help='UTAX confidence value threshold.')
    parser.add_argument('--usearch_db', help='USEARCH Reference Database')
    parser.add_argument('--usearch_cutoff',
                        default=0.7,
                        type=restricted_float,
                        help='USEARCH percent ID threshold.')
    parser.add_argument(
        '-r',
        '--rdp',
        dest='rdp',
        default='/Users/jon/scripts/rdp_classifier_2.10.1/dist/classifier.jar',
        help='Path to RDP Classifier')
    parser.add_argument('--rdp_db',
                        dest='rdp_tax',
                        default='fungalits_unite',
                        choices=[
                            '16srrna', 'fungallsu', 'fungalits_warcup',
                            'fungalits_unite'
                        ],
                        help='Training set for RDP Classifier')
    parser.add_argument('--rdp_cutoff',
                        default=0.8,
                        type=restricted_float,
                        help='RDP confidence value threshold')
    parser.add_argument('--local_blast', help='Path to local Blast DB')
    parser.add_argument('-u',
                        '--usearch',
                        dest="usearch",
                        default='usearch9',
                        help='USEARCH8 EXE')
    parser.add_argument('--tax_filter',
                        help='Retain only OTUs with match in OTU table')
    parser.add_argument('--sintax_cutoff',
                        default=0.8,
                        type=restricted_float,
                        help='SINTAX threshold.')
    # Intermediate files are removed only when --debug is NOT given (see the
    # clean-up step at the end), so the help text says "keep".
    parser.add_argument('--debug',
                        action='store_true',
                        help='Keep intermediate files')
    parser.add_argument('--cpus',
                        type=int,
                        help="Number of CPUs. Default: auto")
    args = parser.parse_args(args)

    parentdir = os.path.join(os.path.dirname(amptklib.__file__))

    # Derive the output basename from the FASTA name unless --out was given.
    if not args.out:
        if 'filtered' in args.fasta:
            base = args.fasta.split(".filtered")[0]
        elif 'otu' in args.fasta:
            base = args.fasta.split('.otu')[0]
        else:
            base = args.fasta.split('.fa')[0]
    else:
        base = args.out

    # Start from a fresh logfile on every run.
    log_name = base + '.amptk-taxonomy.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    amptklib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")

    # initialize script, log system info and check the usearch version
    amptklib.SystemInfo()
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)

    # number of CPUs: explicit value wins, otherwise auto-detect
    if args.cpus:
        cpus = args.cpus
    else:
        cpus = amptklib.getCPUS()

    # Pre-installed databases; each entry is
    # (global-alignment DB, UTAX DB, SINTAX DB).
    DBdir = os.path.join(parentdir, 'DB')
    DataBase = {
        'ITS1': (os.path.join(DBdir, 'ITS.udb'),
                 os.path.join(DBdir, 'ITS1_UTAX.udb'),
                 os.path.join(DBdir, 'ITS_SINTAX.udb')),
        'ITS2': (os.path.join(DBdir, 'ITS.udb'),
                 os.path.join(DBdir, 'ITS2_UTAX.udb'),
                 os.path.join(DBdir, 'ITS_SINTAX.udb')),
        'ITS': (os.path.join(DBdir, 'ITS.udb'),
                os.path.join(DBdir, 'ITS_UTAX.udb'),
                os.path.join(DBdir, 'ITS_SINTAX.udb')),
        '16S': (os.path.join(DBdir, '16S.udb'),
                os.path.join(DBdir, '16S.udb'),
                os.path.join(DBdir, '16S_SINTAX.udb')),
        'LSU': (os.path.join(DBdir, 'LSU.udb'),
                os.path.join(DBdir, 'LSU_UTAX.udb'),
                os.path.join(DBdir, 'LSU_SINTAX.udb')),
        'COI': (os.path.join(DBdir, 'COI.udb'),
                os.path.join(DBdir, 'COI_UTAX.udb'),
                os.path.join(DBdir, 'COI_SINTAX.udb'))
    }

    # Resolve DB paths up front: a known --db name wins, otherwise fall back
    # to the explicitly supplied database options.
    if args.db in DataBase:
        usearch_db, utax_db, sintax_db = DataBase[args.db]
        if not utax_db:
            utax_db = args.utax_db
        if not usearch_db:
            usearch_db = args.usearch_db
    else:
        utax_db = args.utax_db
        usearch_db = args.usearch_db
        if args.fasta_db:
            sintax_db = args.fasta_db
        else:
            sintax_db = args.usearch_db

    install_hint = ('Use `amptk install` to install pre-formatted databases '
                    'or `amptk database` to create custom DB')
    if args.method in ['hybrid', 'usearch', 'utax']:
        if not utax_db and not usearch_db and not args.fasta_db:
            amptklib.log.error(
                "You have not selected a database, need either --db, --utax_db, --usearch_db, or --fasta_db"
            )
            sys.exit(1)
        # check that the selected DB exists
        if args.method == 'usearch' and usearch_db:
            if not amptklib.checkfile(usearch_db):
                amptklib.log.error(
                    'USEARCH DB not found: {:}'.format(usearch_db))
                amptklib.log.error(install_hint)
                sys.exit(1)
        if args.method == 'utax' and utax_db:
            if not amptklib.checkfile(utax_db):
                amptklib.log.error('UTAX DB not found: {:}'.format(utax_db))
                amptklib.log.error(install_hint)
                sys.exit(1)
    elif args.method == 'sintax':
        # This check used to be nested under the hybrid/usearch/utax branch
        # and could never run.  --fasta_db overrides the auto-detected SINTAX
        # DB when SINTAX is executed below, so validate that same path here.
        sintax_target = args.fasta_db or sintax_db
        if not sintax_target:
            amptklib.log.error(
                "You have not selected a database, need either --db, --utax_db, --usearch_db, or --fasta_db"
            )
            sys.exit(1)
        if not amptklib.checkfile(sintax_target):
            amptklib.log.error(
                'SINTAX DB not found: {:}'.format(sintax_target))
            amptklib.log.error(install_hint)
            sys.exit(1)

    custom_db = None
    if args.add2db:
        # user wants to add sequences to the database on the fly, so the
        # reference has to be rebuilt as a plain FASTA file
        custom_db = base + '.custom_database.fa'
        if amptklib.checkfile(custom_db):
            amptklib.SafeRemove(custom_db)
        if args.db:  # pre-built udb: extract its FASTA records first
            amptklib.log.info("Adding {:} to the {:} database".format(
                os.path.basename(args.add2db), os.path.basename(usearch_db)))
            cmd = ['vsearch', '--udb2fasta', usearch_db, '--output', custom_db]
            amptklib.runSubprocess(cmd, amptklib.log)
            with open(custom_db, 'a') as outfile:
                with open(args.add2db, 'r') as infile:
                    shutil.copyfileobj(infile, outfile)
        elif args.fasta_db:
            amptklib.log.info("Adding {:} to the {:} database".format(
                os.path.basename(args.add2db),
                os.path.basename(args.fasta_db)))
            with open(custom_db, 'w') as outfile:
                with open(args.fasta_db, 'r') as infile:
                    shutil.copyfileobj(infile, outfile)
                with open(args.add2db, 'r') as infile:
                    shutil.copyfileobj(infile, outfile)

    # Count records
    amptklib.log.info("Loading FASTA Records")
    total = amptklib.countfasta(args.fasta)
    amptklib.log.info('{0:,}'.format(total) + ' OTUs')

    # declare output files/variables here
    blast_out = base + '.blast.txt'
    rdp_out = base + '.rdp.txt'
    # UTAX gets its own file; sharing the path with usearch_out made hybrid
    # mode parse global-alignment output as UTAX results when no UTAX DB
    # was available.
    utax_out = base + '.utax.txt'
    usearch_out = base + '.usearch.txt'
    sintax_out = base + '.sintax.txt'
    otuDict = {}

    if not args.taxonomy:
        # start with less common uses, i.e. Blast, rdp
        if args.method == 'blast':
            # check if command line blast is installed
            if not amptklib.which('blastn'):
                amptklib.log.error("BLASTN not found in your PATH, exiting.")
                sys.exit(1)

            outformat = "6 qseqid sseqid pident stitle"
            if args.local_blast:
                amptklib.log.info("Running local BLAST using db: %s" %
                                  args.local_blast)
                cmd = [
                    'blastn', '-num_threads',
                    str(cpus), '-query', args.fasta, '-db',
                    os.path.abspath(args.local_blast), '-max_target_seqs', '1',
                    '-outfmt', outformat, '-out', blast_out
                ]
                amptklib.runSubprocess(cmd, amptklib.log)
            else:
                amptklib.log.info(
                    "Running BLASTN using NCBI remote nt database, this may take awhile"
                )
                cmd = [
                    'blastn', '-query', args.fasta, '-db', 'nt', '-remote',
                    '-max_target_seqs', '1', '-outfmt', outformat, '-out',
                    blast_out
                ]
                amptklib.runSubprocess(cmd, amptklib.log)

            # load results and reformat as "<accession>;<title> (<pident>)"
            otuDict = {}
            with open(blast_out) as blast_in:
                for col in csv.reader(blast_in, delimiter=str('\t')):
                    query = col[0]
                    gbID = col[1].split("|")[3]
                    pident = col[2]
                    name = col[3]
                    otuDict[query] = gbID + ";" + name + " (" + pident + ")"
        elif args.method == 'rdp':
            # check that java and the classifier can be launched at all
            try:
                subprocess.Popen(
                    ['java', '-Xmx2000m', '-jar', args.rdp, 'classify'],
                    stdout=subprocess.PIPE).communicate()
            except OSError:
                amptklib.log.error("%s not found in your PATH, exiting." %
                                   args.rdp)
                sys.exit(1)

            # RDP database
            amptklib.log.info("Using RDP classifier %s training set" %
                              args.rdp_tax)

            # run RDP
            cmd = [
                'java', '-Xmx2000m', '-jar', args.rdp, 'classify', '-g',
                args.rdp_tax, '-o', rdp_out, '-f', 'fixrank', args.fasta
            ]
            amptklib.runSubprocess(cmd, amptklib.log)

            # load in results and put into dictionary
            removal = ["unidentified", "Incertae", "uncultured", "incertae"]
            remove_exp = [re.compile(x) for x in removal]
            otuDict = {}
            with open(rdp_out) as rdp_in:
                for col in csv.reader(rdp_in, delimiter=str('\t')):
                    # Report the deepest rank whose confidence column exceeds
                    # the cutoff (columns as laid out by `-f fixrank`).
                    if float(col[19]) > args.rdp_cutoff:
                        tax = "RDP;k:" + col[2] + ",p:" + col[5] + ",c:" + col[
                            8] + ",o:" + col[11] + ",f:" + col[14] + ",g:" + col[17]
                    elif float(col[16]) > args.rdp_cutoff:
                        tax = "RDP;k:" + col[2] + ",p:" + col[5] + ",c:" + col[
                            8] + ",o:" + col[11] + ",f:" + col[14]
                    elif float(col[13]) > args.rdp_cutoff:
                        tax = "RDP;k:" + col[2] + ",p:" + col[5] + ",c:" + col[
                            8] + ",o:" + col[11]
                    elif float(col[10]) > args.rdp_cutoff:
                        tax = "RDP;k:" + col[2] + ",p:" + col[5] + ",c:" + col[8]
                    elif float(col[7]) > args.rdp_cutoff:
                        tax = "RDP;k:" + col[2] + ",p:" + col[5]
                    elif float(col[4]) > args.rdp_cutoff:
                        tax = "RDP;k:" + col[2]
                    else:
                        tax = "RDP;k:unclassified"
                    # drop ranks that carry an uninformative placeholder name
                    kept = [
                        rank for rank in tax.split(",")
                        if not any(
                            pattern.search(rank) for pattern in remove_exp)
                    ]
                    otuDict[col[0]] = ",".join(kept)
        else:
            # check status of USEARCH DB and run
            if args.method in ['hybrid', 'usearch']:
                # Pick the reference for global alignment; the three cases
                # only differ in the DB path and the label that gets logged.
                # NOTE(review): --fasta_db takes priority over a custom DB
                # built via --add2db, so with both options the added
                # sequences are not searched -- confirm this is intended.
                if args.fasta_db:
                    search_db = args.fasta_db
                    search_label = os.path.basename(args.fasta_db)
                elif custom_db:
                    search_db = custom_db
                    search_label = 'custom DB'
                elif usearch_db:
                    search_db = usearch_db
                    search_label = os.path.basename(usearch_db)
                else:
                    search_db = None
                    search_label = None
                if search_db:
                    amptklib.log.info(
                        "Global alignment OTUs with usearch_global (VSEARCH) against {:}"
                        .format(search_label))
                    cmd = [
                        'vsearch', '--usearch_global', args.fasta, '--db',
                        os.path.abspath(search_db), '--userout', usearch_out,
                        '--id',
                        str(args.usearch_cutoff), '--strand', 'both',
                        '--output_no_hits', '--maxaccepts', '0',
                        '--top_hits_only', '--userfields', 'query+target+id',
                        '--notrunclabels', '--threads',
                        str(cpus)
                    ]
                    amptklib.runSubprocess(cmd, amptklib.log)

            if args.method in ['hybrid', 'utax']:
                if utax_db:
                    # now run through UTAX
                    amptklib.log.info("Classifying OTUs with UTAX (USEARCH)")
                    cutoff = str(args.utax_cutoff)
                    cmd = [
                        usearch, '-utax', args.fasta, '-db', utax_db,
                        '-utaxout', utax_out, '-utax_cutoff', cutoff,
                        '-strand', 'plus', '-notrunclabels', '-threads',
                        str(cpus)
                    ]
                    amptklib.runSubprocess(cmd, amptklib.log)
                else:
                    amptklib.log.error("UTAX DB %s not found, skipping" %
                                       utax_db)

            if args.method in ['hybrid', 'sintax']:
                if args.fasta_db:
                    # a FASTA passed here overrides any auto detection
                    sintax_db = args.fasta_db
                # now run sintax
                amptklib.log.info("Classifying OTUs with SINTAX (USEARCH)")
                cmd = [
                    usearch, '-sintax', args.fasta, '-db',
                    os.path.abspath(sintax_db), '-tabbedout', sintax_out,
                    '-sintax_cutoff',
                    str(args.sintax_cutoff), '-strand', 'both', '-threads',
                    str(cpus)
                ]
                amptklib.runSubprocess(cmd, amptklib.log)

            # now process results, load into dictionary - slightly different
            # depending on which classification was run.
            if args.method == 'hybrid':
                # first load dictionaries with the individual results
                if amptklib.checkfile(utax_out):
                    utaxDict = amptklib.classifier2dict(
                        utax_out, args.utax_cutoff)
                    amptklib.log.debug(
                        'UTAX results parsed, resulting in {:,} taxonomy predictions'
                        .format(len(utaxDict)))
                else:
                    amptklib.log.info('UTAX results empty')
                    utaxDict = {}
                if amptklib.checkfile(sintax_out):
                    sintaxDict = amptklib.classifier2dict(
                        sintax_out, args.sintax_cutoff)
                    amptklib.log.debug(
                        'SINTAX results parsed, resulting in {:,} taxonomy predictions'
                        .format(len(sintaxDict)))
                else:
                    amptklib.log.info('SINTAX results empty')
                    sintaxDict = {}
                usearchDict = amptklib.usearchglobal2dict(usearch_out)
                amptklib.log.debug(
                    'Global alignment results parsed, resulting in {:,} taxonomy predictions'
                    .format(len(usearchDict)))
                otuList = natsorted(list(usearchDict.keys()))
                # first compare classifier results, getting better of the two
                bestClassify = amptklib.bestclassifier(utaxDict, sintaxDict,
                                                       otuList)
                # now get best taxonomy by comparing to global alignment
                otuDict = amptklib.bestTaxonomy(usearchDict, bestClassify)
                amptklib.log.debug(
                    'Combined OTU taxonomy dictionary contains {:,} taxonomy predictions'
                    .format(len(otuDict)))
                if len(otuDict) < 1:
                    amptklib.log.info('Parsing taxonomy failed -- see logfile')
                    sys.exit(1)

            elif args.method == 'utax' and amptklib.checkfile(utax_out):
                # load results into dictionary for appending to OTU table
                amptklib.log.debug("Loading UTAX results into dictionary")
                with open(utax_out, 'r') as infile:
                    reader = csv.reader(infile, delimiter=str("\t"))
                    otuDict = {rows[0]: 'UTAX;' + rows[2] for rows in reader}

            elif args.method == 'usearch' and amptklib.checkfile(usearch_out):
                # load results into dictionary for appending to OTU table
                amptklib.log.debug(
                    "Loading Global Alignment results into dictionary")
                otuDict = {}
                usearchDict = amptklib.usearchglobal2dict(usearch_out)
                for k, v in natsorted(list(usearchDict.items())):
                    pident = float(v[0]) * 100
                    pident = "{0:.1f}".format(pident)
                    ID = v[1]
                    tax = ','.join(v[-1])
                    LCA = v[2]
                    # an empty LCA field gets the 'GS' prefix, otherwise 'GSL'
                    if LCA == '':
                        fulltax = 'GS|' + pident + '|' + ID + ';' + tax
                    else:
                        fulltax = 'GSL|' + pident + '|' + ID + ';' + tax
                    otuDict[k] = fulltax

            elif args.method == 'sintax' and amptklib.checkfile(sintax_out):
                # load results into dictionary for appending to OTU table
                amptklib.log.debug("Loading SINTAX results into dictionary")
                with open(sintax_out, 'r') as infile:
                    reader = csv.reader(infile, delimiter=(str("\t")))
                    otuDict = {rows[0]: 'SINTAX;' + rows[3] for rows in reader}
    else:
        # a two column taxonomy file was supplied, parse and build otuDict
        amptklib.log.debug("Loading custom Taxonomy into dictionary")
        with open(args.taxonomy, 'r') as infile:
            reader = csv.reader(infile, delimiter=str("\t"))
            otuDict = {rows[0]: rows[1] for rows in reader}

    # now format results
    if args.otu_table:
        amptklib.log.info("Appending taxonomy to OTU table and OTUs")
        taxTable = base + '.otu_table.taxonomy.txt'
        tmpTable = base + '.otu_table.tmp'
        header_tags = ("#OTU", "OTUId")
        # --tax_filter is ignored for blast results (reported further down)
        filtering = bool(args.tax_filter) and not args.method == 'blast'

        # append a Taxonomy column to the OTU table
        counts = 0
        with open(taxTable, 'w') as outTable:
            with open(args.otu_table, 'r') as inTable:
                # guess the delimiter format from the first line
                firstline = inTable.readline()
                dialect = amptklib.guess_csv_dialect(firstline)
                inTable.seek(0)
                # parse OTU table
                reader = csv.reader(inTable, dialect)
                for line in reader:
                    is_header = line[0].startswith(header_tags)
                    if is_header:
                        line.append('Taxonomy')
                    else:
                        line.append(otuDict.get(line[0]) or "No Hit")
                    if filtering:
                        # header always passes; data rows must match the
                        # filter and only those are counted
                        if not is_header:
                            if args.tax_filter not in line[-1]:
                                continue
                            counts += 1
                    else:
                        counts += 1
                    outTable.write("%s\n" % '\t'.join(str(x) for x in line))

        if args.tax_filter:
            if args.method == 'blast':
                amptklib.log.info(
                    "Blast is incompatible with --tax_filter, use a different method"
                )
                tmpTable = args.otu_table
            else:
                nonfungal = total - counts
                amptklib.log.info(
                    "Found %i OTUs not matching %s, writing %i %s hits to taxonomy OTU table"
                    % (nonfungal, args.tax_filter, counts, args.tax_filter))
                # need a filtered table without taxonomy for BIOM output
                with open(tmpTable, 'w') as output:
                    with open(taxTable, 'r') as tax_in:
                        firstline = tax_in.readline()
                        dialect = amptklib.guess_csv_dialect(firstline)
                        tax_in.seek(0)
                        # parse OTU table, dropping the Taxonomy column
                        reader = csv.reader(tax_in, dialect)
                        for line in reader:
                            del line[-1]
                            join_line = '\t'.join(str(x) for x in line)
                            output.write("%s\n" % join_line)
        else:
            tmpTable = args.otu_table

    # append taxonomy to the OTU sequences
    otuTax = base + '.otus.taxonomy.fa'
    with open(otuTax, 'w') as output:
        with open(args.fasta, 'r') as fasta_in:
            for rec in SeqIO.parse(fasta_in, 'fasta'):
                rec.description = otuDict.get(rec.id) or "No hit"
                SeqIO.write(rec, output, 'fasta')

    if not args.taxonomy:
        # output final taxonomy in two-column format, followed by the hits
        # for usearch/sintax/utax if hybrid is used.
        taxFinal = base + '.taxonomy.txt'
        with open(taxFinal, 'w') as finaltax:
            if args.method == 'hybrid':
                finaltax.write('#OTUID\ttaxonomy\tUSEARCH\tSINTAX\tUTAX\n')
                for k, v in natsorted(list(otuDict.items())):
                    if k in usearchDict:
                        usearchResult = ','.join(usearchDict.get(k)[-1])
                    else:
                        usearchResult = 'No hit'
                    if k in sintaxDict:
                        sintaxResult = ','.join(sintaxDict.get(k)[-1])
                    else:
                        sintaxResult = 'No hit'
                    if k in utaxDict:
                        utaxResult = ','.join(utaxDict.get(k)[-1])
                    else:
                        utaxResult = 'No hit'
                    finaltax.write('{:}\t{:}\t{:}\t{:}\t{:}\n'.format(
                        k, v, usearchResult, sintaxResult, utaxResult))
            else:
                finaltax.write('#OTUID\ttaxonomy\n')
                for k, v in natsorted(list(otuDict.items())):
                    finaltax.write('%s\t%s\n' % (k, v))
    else:
        taxFinal = args.taxonomy

    # convert taxonomy to qiime format for biom
    qiimeTax = None
    if not args.method == 'blast':
        qiimeTax = base + '.qiime.taxonomy.txt'
        amptklib.utax2qiime(taxFinal, qiimeTax)
    else:
        amptklib.log.error(
            "Blast taxonomy is not compatible with BIOM output, use a different method"
        )

    # create OTU phylogeny for downstream processes
    amptklib.log.info("Generating phylogenetic tree")
    tree_out = base + '.tree.phy'
    cmd = [usearch, '-cluster_agg', args.fasta, '-treeout', tree_out]
    amptklib.runSubprocess(cmd, amptklib.log)

    # print some summary file locations
    amptklib.log.info("Taxonomy finished: %s" % taxFinal)
    if args.otu_table and not args.method == 'blast':
        amptklib.log.info("Classic OTU table with taxonomy: %s" % taxTable)
        # output final OTU table in Biom v1.0 (json) if biom is installed
        outBiom = base + '.biom'
        if amptklib.which('biom'):
            amptklib.removefile(outBiom)
            cmd = [
                'biom', 'convert', '-i', tmpTable, '-o', outBiom + '.tmp',
                '--table-type', "OTU table", '--to-json'
            ]
            amptklib.runSubprocess(cmd, amptklib.log)
            if args.mapping_file:
                # collect sample IDs from the mapping file, noting duplicates
                mapSamples = []
                repeatSamples = []
                with open(args.mapping_file, 'r') as mapin:
                    for line in mapin:
                        line = line.rstrip()
                        if line.startswith('#'):
                            continue
                        sampleID = line.split('\t')[0]
                        if sampleID not in mapSamples:
                            mapSamples.append(sampleID)
                        else:
                            repeatSamples.append(sampleID)
                # sample names come from the '#' header line of the table
                otuSamples = []
                with open(tmpTable, 'r') as otuin:
                    for line in otuin:
                        line = line.rstrip()
                        if line.startswith('#'):
                            otuSamples = line.split('\t')[1:]
                missingMap = [
                    otu for otu in otuSamples if otu not in mapSamples
                ]
                if len(missingMap) > 0:
                    amptklib.log.error(
                        "%s are missing from mapping file (metadata), skipping biom file creation"
                        % ', '.join(missingMap))
                elif len(repeatSamples) > 0:
                    amptklib.log.error(
                        '%s duplicate sample IDs in mapping file, skipping biom file creation'
                        % ', '.join(repeatSamples))
                else:
                    if qiimeTax:
                        cmd = [
                            'biom', 'add-metadata', '-i', outBiom + '.tmp',
                            '-o', outBiom, '--observation-metadata-fp',
                            qiimeTax, '-m', args.mapping_file,
                            '--sc-separated', 'taxonomy', '--output-as-json'
                        ]
                    else:
                        cmd = [
                            'biom', 'add-metadata', '-i', outBiom + '.tmp',
                            '-o', outBiom, '-m', args.mapping_file,
                            '--output-as-json'
                        ]
                    amptklib.runSubprocess(cmd, amptklib.log)
            else:
                cmd = [
                    'biom', 'add-metadata', '-i', outBiom + '.tmp', '-o',
                    outBiom, '--observation-metadata-fp', qiimeTax,
                    '--sc-separated', 'taxonomy', '--output-as-json'
                ]
                amptklib.runSubprocess(cmd, amptklib.log)
            amptklib.removefile(outBiom + '.tmp')
            amptklib.log.info("BIOM OTU table created: %s" % outBiom)
        else:
            amptklib.log.info(
                "biom program not installed, install via `pip install biom-format` or `conda install biom-format`"
            )
    amptklib.log.info("OTUs with taxonomy: %s" % otuTax)
    amptklib.log.info("OTU phylogeny: %s" % tree_out)

    # clean up intermediate files unless --debug was requested
    if not args.debug:
        for i in [
                utax_out, usearch_out, sintax_out, qiimeTax,
                base + '.otu_table.tmp'
        ]:
            if i:
                amptklib.removefile(i)
    print("-------------------------------------------------------")
Beispiel #7
0
def main(args):
    """Run the bundled phyloseq R script(s) on a BIOM table and tree.

    Parses the ``amptk-stats.py`` command line, verifies that ``Rscript`` is
    available and that the detected phyloseq version passes
    ``amptklib.gvc``, creates the output folder and then calls the R script
    once per requested distance metric.

    Args:
        args: list of command-line argument strings handed to ``argparse``.

    Side effects:
        Writes ``<out>.amptk-stats.log`` and creates the ``<out>`` folder.
        Calls ``sys.exit(1)`` when the phyloseq version check fails.
    """
    parser = argparse.ArgumentParser(
        prog='amptk-stats.py',
        description=
        '''Script takes BIOM as input and runs basic summary stats''',
        epilog="""Written by Jon Palmer (2017) [email protected]""",
        formatter_class=MyFormatter)

    parser.add_argument('-i',
                        '--biom',
                        required=True,
                        help='Input BIOM file (OTU table + metadata)')
    parser.add_argument('-t',
                        '--tree',
                        required=True,
                        help='Phylogentic tree from AMPtk taxonomy')
    parser.add_argument('-o',
                        '--out',
                        default='amptk_stats',
                        help='Output folder basename')
    # NOTE(review): 'aitchison' is accepted here but is not part of the list
    # that 'all' expands to further down -- confirm this is intended.
    parser.add_argument('-d',
                        '--distance',
                        default='raupcrick',
                        choices=[
                            'raupcrick', 'bray', 'unifrac', 'wunifrac',
                            'jaccard', 'aitchison', 'all'
                        ],
                        help="Distance metric")
    parser.add_argument('--indicator_species',
                        action='store_true',
                        help='Run indicator species analysis')
    parser.add_argument('--ignore_otus',
                        nargs="+",
                        help='OTUs to drop from table and run stats')
    parser.add_argument(
        '--ord_method',
        default='NMDS',
        choices=["DCA", "CCA", "RDA", "DPCoA", "NMDS", "MDS", "PCoA"],
        help='Ordination method')
    parser.add_argument(
        '--ord_ellipse',
        action='store_true',
        help='Add ellipses on NMDS instead of centroids & error bars')
    #parser.add_argument('-t','--treatments', nargs='+', help='treatments (metadata variables) to run, Default: all')
    args = parser.parse_args(args)

    # The R scripts are looked up next to the amptklib module.
    parentdir = os.path.join(os.path.dirname(amptklib.__file__))
    if args.indicator_species:
        phyloseqCMD = os.path.join(parentdir, 'phyloseq_nmds_indicator.R')
    else:
        phyloseqCMD = os.path.join(parentdir, 'phyloseq_nmds.R')

    #remove logfile if exists
    log_name = args.out + '.amptk-stats.log'
    if os.path.isfile(log_name):
        amptklib.removefile(log_name)

    amptklib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")
    #initialize script, log system info
    amptklib.SystemInfo()

    #check dependencies
    programs = ['Rscript']
    amptklib.CheckDependencies(programs)
    Rversions = amptklib.checkRversion()
    phyloseq_pass = '******'
    #check the phyloseq version, stop here if it does not pass
    if not amptklib.gvc(Rversions[2], phyloseq_pass):
        amptklib.log.error("R v%s; Phyloseq v%s detected, need atleast v%s" %
                           (Rversions[0], Rversions[2], phyloseq_pass))
        amptklib.log.error(
            "See: https://joey711.github.io/phyloseq/index.html")
        sys.exit(1)
    amptklib.log.info("R v%s; Phyloseq v%s" % (Rversions[0], Rversions[2]))

    #this is a simple wrapper for an R script so easier to run from amptk menu
    if not os.path.isdir(args.out):
        os.makedirs(args.out)

    phylolog = os.path.join(args.out, 'phyloseq-R.log')
    if args.distance == 'all':
        distances = ['raupcrick', 'bray', 'unifrac', 'wunifrac', 'jaccard']
        amptklib.log.info(
            "Running hypothesis test using %s distance metrics on all treatments, drawing %s for each."
            % (','.join(distances), args.ord_method))
    else:
        distances = [args.distance]
        # Report the ordination method that is actually passed to R rather
        # than a hard-coded "NMDS".
        amptklib.log.info(
            "Running hypothesis test using %s distance metric on all treatments, drawing %s for each."
            % (args.distance, args.ord_method))

    # One R invocation per distance metric. Positional arguments are:
    # biom, tree, output folder, distance, ordination method, ellipse flag,
    # followed by any OTU names to ignore.
    for dist in distances:
        cmd = [
            'Rscript', '--vanilla', phyloseqCMD,
            os.path.abspath(args.biom),
            os.path.abspath(args.tree), args.out, dist, args.ord_method,
            str(args.ord_ellipse)
        ]
        if args.ignore_otus:
            cmd = cmd + args.ignore_otus
        amptklib.runSubprocess3(cmd, amptklib.log, args.out, phylolog)

    amptklib.log.info(
        'HTML output files were generated for each treatment: {:}'.format(
            args.out))
    print("-------------------------------------------------------")
Beispiel #8
0
def main(args):
    """Run DADA2 on demultiplexed reads and build ASV and OTU tables.

    Steps, in order:
      1. parse the ``amptk-dada2.py`` command line and set up logging;
      2. check the usearch/R/DADA2 dependencies (exits with status 1 when the
         DADA2 version check fails);
      3. strip padding from the input FASTQ, quality filter it with vsearch
         and split it into per-sample files;
      4. drop samples with fewer than ``--min_reads`` reads and run the
         bundled DADA2 R script (exits with status 1 when it produces no
         output table);
      5. write the sequences as ``ASV<n>`` FASTA records, optionally run
         reference chimera filtering, validate orientation;
      6. map reads to ASVs, cluster ASVs into OTUs and map reads to OTUs;
      7. remove intermediate files unless ``--debug`` is given and print the
         locations of the result files.

    Args:
        args: list of command-line argument strings handed to ``argparse``.
    """
    parser = argparse.ArgumentParser(
        prog='amptk-dada2.py',
        description=
        '''Script takes output from amptk pre-processing and runs DADA2''',
        epilog="""Written by Jon Palmer (2016) [email protected]""",
        formatter_class=MyFormatter)

    parser.add_argument('-i',
                        '--fastq',
                        required=True,
                        help='Input Demuxed containing FASTQ')
    parser.add_argument('-o', '--out', help='Output Basename')
    parser.add_argument(
        '-m',
        '--min_reads',
        default=10,
        type=int,
        help="Minimum number of reads after Q filtering to run DADA2 on")
    parser.add_argument('-l',
                        '--length',
                        type=int,
                        help='Length to truncate reads')
    parser.add_argument('-e',
                        '--maxee',
                        default='1.0',
                        help='MaxEE quality filtering')
    parser.add_argument('-p',
                        '--pct_otu',
                        default='97',
                        help="Biological OTU Clustering Percent")
    parser.add_argument('--platform',
                        default='ion',
                        choices=['ion', 'illumina', '454'],
                        help='Sequencing platform')
    parser.add_argument('--chimera_method',
                        default='consensus',
                        choices=['consensus', 'pooled', 'per-sample'],
                        help='bimera removal method')
    parser.add_argument('--uchime_ref',
                        help='Run UCHIME REF [ITS,16S,LSU,COI,custom]')
    parser.add_argument('--pool',
                        action='store_true',
                        help='Pool all sequences together for DADA2')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Keep all intermediate files')
    parser.add_argument('-u',
                        '--usearch',
                        dest="usearch",
                        default='usearch9',
                        help='USEARCH9 EXE')
    parser.add_argument('--cpus',
                        type=int,
                        help="Number of CPUs. Default: auto")
    args = parser.parse_args(args)

    parentdir = os.path.join(os.path.dirname(amptklib.__file__))
    dada2script = os.path.join(parentdir, 'dada2_pipeline_nofilt.R')

    #get basename if not args.out passed
    if args.out:
        base = args.out
    elif 'demux' in args.fastq:
        base = os.path.basename(args.fastq).split('.demux')[0]
    else:
        base = os.path.basename(args.fastq).split('.f')[0]

    #remove logfile if exists
    log_name = base + '.amptk-dada2.log'
    if os.path.isfile(log_name):
        amptklib.removefile(log_name)

    amptklib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")
    #initialize script, log system info and usearch version
    amptklib.SystemInfo()
    #Do a version check
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)

    #get number of cores (kept as a string because it is only used in commands)
    if args.cpus:
        CORES = str(args.cpus)
    else:
        CORES = str(amptklib.getCPUS())

    #check dependencies
    programs = ['Rscript']
    amptklib.CheckDependencies(programs)
    Rversions = amptklib.checkRversion()
    dada2_pass = '******'

    #check dada2 first, if good move on, otherwise stop
    if not amptklib.gvc(Rversions[1], dada2_pass):
        amptklib.log.error("R v%s; DADA2 v%s detected, need atleast v%s" %
                           (Rversions[0], Rversions[1], dada2_pass))
        amptklib.log.error(
            "See: http://benjjneb.github.io/dada2/dada-installation.html")
        sys.exit(1)
    amptklib.log.info("R v%s; DADA2 v%s" % (Rversions[0], Rversions[1]))

    #Count FASTQ records and remove 3' N's as dada2 can't handle them
    amptklib.log.info("Loading FASTQ Records")
    no_ns = base + '.cleaned_input.fq'
    if args.fastq.endswith('.gz'):
        # Decompress into the working directory; strip only the trailing
        # '.gz' so a '.gz' elsewhere in the name is left alone.
        fastqInput = os.path.basename(args.fastq[:-len('.gz')])
        amptklib.Funzip(os.path.abspath(args.fastq), fastqInput, CORES)
    else:
        # Use the full path: the input need not live in the working directory.
        fastqInput = os.path.abspath(args.fastq)
    amptklib.fastq_strip_padding(fastqInput, no_ns)
    demuxtmp = base + '.original.fa'
    cmd = [
        'vsearch', '--fastq_filter',
        os.path.abspath(no_ns), '--fastq_qmax', '55', '--fastaout', demuxtmp,
        '--threads', CORES
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    orig_total = amptklib.countfasta(demuxtmp)
    size = amptklib.checkfastqsize(no_ns)
    readablesize = amptklib.convertSize(size)
    amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize +
                      ')')

    #quality filter
    amptklib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
    derep = base + '.qual-filtered.fq'
    filtercmd = [
        'vsearch', '--fastq_filter', no_ns, '--fastq_maxee',
        str(args.maxee), '--fastqout', derep, '--fastq_qmax', '55',
        '--fastq_maxns', '0', '--threads', CORES
    ]
    amptklib.runSubprocess(filtercmd, amptklib.log)
    total = amptklib.countfastq(derep)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')

    #split into individual files
    amptklib.log.info("Splitting FASTQ file by Sample into individual files")
    filtfolder = base + '_filtered'
    if os.path.isdir(filtfolder):
        shutil.rmtree(filtfolder)
    os.makedirs(filtfolder)
    splitDemux2(derep, filtfolder, args=args)

    #check for minimum number of reads in each sample
    remove = [
        x for x in os.listdir(filtfolder)
        if x.endswith('.fastq') and
        amptklib.countfastq(os.path.join(filtfolder, x)) < args.min_reads
    ]
    if remove:
        amptklib.log.info("Dropping %s as fewer than %i reads" %
                          (', '.join(remove), args.min_reads))
        for y in remove:
            os.remove(os.path.join(filtfolder, y))

    #now run DADA2 on filtered folder
    amptklib.log.info("Running DADA2 pipeline")
    dada2log = base + '.dada2.Rscript.log'
    dada2out = base + '.dada2.csv'
    #check pooling vs notpooled, default is not pooled.
    POOL = 'TRUE' if args.pool else 'FALSE'
    with open(dada2log, 'w') as logfile:
        subprocess.call([
            'Rscript', '--vanilla', dada2script, filtfolder, dada2out,
            args.platform, POOL, CORES, args.chimera_method
        ],
                        stdout=logfile,
                        stderr=logfile)

    #check for results
    if not os.path.isfile(dada2out):
        amptklib.log.error("DADA2 run failed, please check %s logfile" %
                           dada2log)
        sys.exit(1)

    #now process the output, pull out fasta, rename, etc
    fastaout = base + '.otus.tmp'
    OTUCounts = {}
    with open(fastaout, 'w') as writefasta:
        with open(dada2out, 'r') as dada2table:
            next(dada2table)  # skip the header row
            for counter, line in enumerate(dada2table, 1):
                line = line.replace('\n', '')
                line = line.replace('"', '')
                cols = line.split(',')
                Seq = cols[0]
                # remaining columns are summed into one count per sequence
                counts = sum(int(x) for x in cols[1:])
                ID = 'ASV' + str(counter)
                OTUCounts[ID] = counts
                writefasta.write(">%s\n%s\n" % (ID, Seq))

    #get number of bimeras from logfile
    bimeras = None
    totalSeqs = None
    with open(dada2log, 'r') as bimeracheck:
        for line in bimeracheck:
            if line.startswith('Identified '):
                bimeraline = line.split(' ')
                bimeras = int(bimeraline[1])
                totalSeqs = int(bimeraline[5])
    if totalSeqs is None:
        # No line starting with 'Identified ' was found in the R log. Fall
        # back to the number of sequences read from the DADA2 table so the
        # reference-chimera arithmetic below still has a starting count.
        validSeqs = len(OTUCounts)
        amptklib.log.info(
            'Unable to parse bimera counts from %s' % dada2log)
    else:
        validSeqs = totalSeqs - bimeras
        amptklib.log.info('{0:,}'.format(totalSeqs) +
                          ' total amplicon sequence variants (ASVs)')
        amptklib.log.info('{0:,}'.format(bimeras) + ' denovo chimeras removed')
    amptklib.log.info('{0:,}'.format(validSeqs) + ' valid ASVs')

    #optional UCHIME Ref
    chimeraFreeTable = base + '.otu_table.txt'
    iSeqs = base + '.ASVs.fa'
    # uchime_db stays None when reference filtering is not requested or when
    # no usable database can be located.
    uchime_db = None
    if args.uchime_ref:
        #check if file is present, remove from previous run if it is.
        if os.path.isfile(iSeqs):
            amptklib.removefile(iSeqs)
        #R. Edgar now says using largest DB is better for UCHIME, so use the one distributed with taxonomy
        if args.uchime_ref in [
                'ITS', '16S', 'LSU', 'COI'
        ]:  #test if it is one that is setup, otherwise default to full path
            udb = os.path.join(parentdir, 'DB', args.uchime_ref + '.udb')
            extracted = os.path.join(parentdir, 'DB',
                                     args.uchime_ref + '.extracted.fa')
            if not os.path.isfile(udb):
                amptklib.log.error(
                    "Database not properly configured, run `amptk install` to setup DB, skipping chimera filtering"
                )
            else:
                #since uchime cannot work with udb database, need to extract fasta sequences, do this once
                if not amptklib.checkfile(extracted):
                    cmd = [
                        'vsearch', '--udb2fasta', udb, '--output', extracted
                    ]
                    amptklib.runSubprocess(cmd, amptklib.log)
                uchime_db = extracted
        elif os.path.isfile(args.uchime_ref):
            uchime_db = os.path.abspath(args.uchime_ref)
        else:
            amptklib.log.error(
                "%s is not a valid file, skipping reference chimera filtering"
                % args.uchime_ref)

    if uchime_db is None:
        # No reference filtering: the DADA2 sequences become the ASV file.
        os.rename(fastaout, iSeqs)
    else:
        #now run chimera filtering if all checks out
        amptklib.log.info("Chimera Filtering (VSEARCH) using %s DB" %
                          args.uchime_ref)
        cmd = [
            'vsearch', '--mindiv', '1.0', '--uchime_ref', fastaout, '--db',
            uchime_db, '--nonchimeras', iSeqs, '--threads', CORES
        ]
        amptklib.runSubprocess(cmd, amptklib.log)
        total = amptklib.countfasta(iSeqs)
        uchime_chimeras = validSeqs - total
        amptklib.log.info('{0:,}'.format(total) + ' ASVs passed, ' +
                          '{0:,}'.format(uchime_chimeras) +
                          ' ref chimeras removed')
        if os.path.isfile(fastaout):
            amptklib.removefile(fastaout)

    #setup output files
    dadademux = base + '.dada2.map.uc'
    bioSeqs = base + '.cluster.otus.fa'
    bioTable = base + '.cluster.otu_table.txt'
    uctmp = base + '.map.uc'
    ClusterComp = base + '.ASVs2clusters.txt'

    #Filter out ASVs in wrong orientation
    amptklib.log.info('Validating ASV orientation')
    os.rename(iSeqs, iSeqs + '.bak')
    numKept, numDropped = amptklib.validateorientationDADA2(
        OTUCounts, iSeqs + '.bak', iSeqs)
    amptklib.log.info('{:,} ASVs validated ({:,} dropped)'.format(
        numKept, numDropped))
    amptklib.SafeRemove(iSeqs + '.bak')

    #map reads to DADA2 OTUs
    amptklib.log.info("Mapping reads to DADA2 ASVs")
    cmd = [
        'vsearch', '--usearch_global', demuxtmp, '--db', iSeqs, '--id', '0.97',
        '--uc', dadademux, '--strand', 'plus', '--otutabout', chimeraFreeTable,
        '--threads', CORES
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.line_count2(dadademux)
    amptklib.log.info('{0:,}'.format(total) + ' reads mapped to ASVs ' +
                      '({0:.0f}%)'.format(total / float(orig_total) * 100))

    #cluster
    amptklib.log.info("Clustering ASVs at %s%% to generate biological OTUs" %
                      args.pct_otu)
    radius = float(args.pct_otu) / 100.
    cmd = [
        'vsearch', '--cluster_smallmem', iSeqs, '--centroids', bioSeqs, '--id',
        str(radius), '--strand', 'plus', '--relabel', 'OTU', '--qmask', 'none',
        '--usersort', '--threads', CORES
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(bioSeqs)
    amptklib.log.info('{0:,}'.format(total) + ' OTUs generated')

    #determine where iSeqs clustered
    iSeqmap = base + '.ASV_map.uc'
    cmd = [
        'vsearch', '--usearch_global', iSeqs, '--db', bioSeqs, '--id',
        str(radius), '--uc', iSeqmap, '--strand', 'plus', '--threads', CORES
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    # Group column 9 (stored as Hit) under column 10 (stored as OTU) of the
    # tab-separated .uc file.
    iSeqMapped = {}
    with open(iSeqmap, 'r') as mapping:
        for line in mapping:
            cols = line.replace('\n', '').split('\t')
            OTU = cols[9]
            Hit = cols[8]
            iSeqMapped.setdefault(OTU, []).append(Hit)
    with open(ClusterComp, 'w') as clusters:
        clusters.write('OTU\tASVs\n')
        for k, v in natsorted(list(iSeqMapped.items())):
            clusters.write('%s\t%s\n' % (k, ', '.join(v)))
    #create OTU table
    amptklib.log.info("Mapping reads to OTUs")
    cmd = [
        'vsearch', '--usearch_global', demuxtmp, '--db', bioSeqs, '--id',
        '0.97', '--uc', uctmp, '--strand', 'plus', '--otutabout', bioTable,
        '--threads', CORES
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.line_count2(uctmp)
    amptklib.log.info('{0:,}'.format(total) + ' reads mapped to OTUs ' +
                      '({0:.0f}%)'.format(total / float(orig_total) * 100))

    if not args.debug:
        amptklib.removefile(no_ns)
        shutil.rmtree(filtfolder)
        amptklib.removefile(dada2out)
        amptklib.removefile(derep)
        amptklib.removefile(demuxtmp)
        amptklib.removefile(uctmp)
        amptklib.removefile(iSeqmap)
        amptklib.removefile(dadademux)

    #Print location of files to STDOUT
    print("-------------------------------------------------------")
    print("DADA2 Script has Finished Successfully")
    print("-------------------------------------------------------")
    if args.debug:
        print("Tmp Folder of files: %s" % filtfolder)
    print("Amplicon sequence variants: %s" % iSeqs)
    print("ASV OTU Table: %s" % chimeraFreeTable)
    print("Clustered OTUs: %s" % bioSeqs)
    print("OTU Table: %s" % bioTable)
    print("ASVs 2 OTUs: %s" % ClusterComp)
    print("-------------------------------------------------------")

    otu_print = bioSeqs.split('/')[-1]
    tab_print = bioTable.split('/')[-1]
    if 'darwin' in sys.platform:
        print(colr.WARN + "\nExample of next cmd:" + colr.END +
              " amptk filter -i %s -f %s -b <mock barcode>\n" %
              (tab_print, otu_print))
    else:
        print(
            "\nExample of next cmd: amptk filter -i %s -f %s -b <mock barcode>\n"
            % (tab_print, otu_print))
Beispiel #9
0
def main(args):
    """Cluster demultiplexed reads into OTUs and build an OTU table.

    Steps, in order:
      1. parse the ``amptk-OTU_cluster.py`` command line and set up logging;
      2. convert the input FASTQ to FASTA, quality filter and dereplicate it
         with vsearch;
      3. optionally run the ``--unoise`` usearch step, then sort by size and
         drop sequences below ``--minsize``;
      4. cluster with ``usearch -cluster_otus``, strip padding, optionally
         run reference chimera filtering and validate orientation;
      5. map reads back to the OTUs to build the OTU table;
      6. copy the final FASTA and table to the working directory and delete
         the tmp folder unless ``--debug`` is given.

    Args:
        args: list of command-line argument strings handed to ``argparse``.
    """
    parser = argparse.ArgumentParser(
        prog='amptk-OTU_cluster.py',
        usage="%(prog)s [options] -i file.demux.fq\n%(prog)s -h for help menu",
        description='''Script runs UPARSE OTU clustering.
		Requires USEARCH by Robert C. Edgar: http://drive5.com/usearch''',
        epilog="""Written by Jon Palmer (2015) [email protected]""",
        formatter_class=MyFormatter)

    parser.add_argument('-i',
                        '--fastq',
                        dest="FASTQ",
                        required=True,
                        help='FASTQ file (Required)')
    parser.add_argument('-o', '--out', help='Base output name')
    parser.add_argument('-e',
                        '--maxee',
                        default='1.0',
                        help='Quality trim EE value')
    parser.add_argument('-p',
                        '--pct_otu',
                        default='97',
                        help="OTU Clustering Percent")
    parser.add_argument('-m',
                        '--minsize',
                        default='2',
                        help='Min size to keep for clustering')
    parser.add_argument('-u',
                        '--usearch',
                        dest="usearch",
                        default='usearch9',
                        help='USEARCH9 EXE')
    parser.add_argument('--uchime_ref',
                        help='Run UCHIME REF [ITS,16S,LSU,COI,custom]')
    parser.add_argument('--map_filtered',
                        action='store_true',
                        help='map quality filtered reads back to OTUs')
    parser.add_argument('--unoise',
                        action='store_true',
                        help='Run De-noising (UNOISE)')
    # The tmp folder is only deleted when --debug is NOT given, so the help
    # text describes keeping the files.
    parser.add_argument('--debug',
                        action='store_true',
                        help='Keep all intermediate files')
    parser.add_argument('--cpus',
                        type=int,
                        help="Number of CPUs. Default: auto")
    args = parser.parse_args(args)

    parentdir = os.path.join(os.path.dirname(amptklib.__file__))

    #get basename if not args.out passed
    if args.out:
        base = args.out
    elif 'demux' in args.FASTQ:
        base = os.path.basename(args.FASTQ).split('.demux')[0]
    else:
        base = os.path.basename(args.FASTQ).split('.f')[0]

    #remove logfile if exists
    log_name = base + '.amptk-cluster.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    amptklib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")

    #initialize script, log system info and usearch version
    amptklib.SystemInfo()
    #Do a version check
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)

    #get number of cpus
    if args.cpus:
        cpus = args.cpus
    else:
        cpus = amptklib.getCPUS()

    #make tmp folder
    tmp = base + '_tmp'
    if not os.path.exists(tmp):
        os.makedirs(tmp)

    #Count FASTQ records
    amptklib.log.info("Loading FASTQ Records")
    #convert to FASTA for mapping
    orig_fasta = os.path.join(tmp, base + '.orig.fa')
    cmd = [
        'vsearch', '--fastq_filter', args.FASTQ, '--fastaout', orig_fasta,
        '--fastq_qmax', '55', '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    orig_total = amptklib.countfasta(orig_fasta)
    size = amptklib.checkfastqsize(args.FASTQ)
    readablesize = amptklib.convertSize(size)
    amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize +
                      ')')

    #Expected Errors filtering step
    filter_out = os.path.join(tmp, base + '.EE' + args.maxee + '.filter.fq')
    filter_fasta = os.path.join(tmp, base + '.EE' + args.maxee + '.filter.fa')
    amptklib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
    cmd = [
        'vsearch', '--fastq_filter', args.FASTQ, '--fastq_maxee',
        str(args.maxee), '--fastqout', filter_out, '--fastaout', filter_fasta,
        '--fastq_qmax', '55', '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfastq(filter_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')

    #now run full length dereplication
    derep_out = os.path.join(tmp, base + '.EE' + args.maxee + '.derep.fa')
    amptklib.log.info("De-replication (remove duplicate reads)")
    cmd = [
        'vsearch', '--derep_fulllength', filter_fasta, '--sizeout', '--output',
        derep_out, '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(derep_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')

    #optional run UNOISE
    if args.unoise:
        unoise_out = os.path.join(
            tmp, base + '.EE' + args.maxee + '.denoised.fa')
        amptklib.log.info("Denoising Data with UNOISE")
        cmd = [
            usearch, '-cluster_fast', derep_out, '-centroids', unoise_out,
            '-id', '0.9', '--maxdiffs', '5', '-abskew', '10', '-sizein',
            '-sizeout', '-sort', 'size', '-threads',
            str(cpus)
        ]
        amptklib.runSubprocess(cmd, amptklib.log)
        total = amptklib.countfasta(unoise_out)
        amptklib.log.info('{0:,}'.format(total) + ' reads passed')
    else:
        unoise_out = derep_out

    #now sort by size remove singletons
    sort_out = os.path.join(tmp, base + '.EE' + args.maxee + '.sort.fa')
    cmd = [
        'vsearch', '--sortbysize', unoise_out, '--minsize', args.minsize,
        '--output', sort_out, '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)

    #now run clustering algorithm
    # usearch takes the radius as "100 - percent identity"
    radius = str(100 - int(args.pct_otu))
    otu_out = os.path.join(tmp, base + '.EE' + args.maxee + '.otus.fa')
    amptklib.log.info("Clustering OTUs (UPARSE)")
    cmd = [
        usearch, '-cluster_otus', sort_out, '-relabel', 'OTU',
        '-otu_radius_pct', radius, '-otus', otu_out, '-threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    numOTUs = amptklib.countfasta(otu_out)
    amptklib.log.info('{0:,}'.format(numOTUs) + ' OTUs')

    #clean up padded N's
    amptklib.log.info("Cleaning up padding from OTUs")
    otu_clean = os.path.join(tmp, base + '.EE' + args.maxee + '.clean.otus.fa')
    amptklib.fasta_strip_padding(otu_out, otu_clean)

    #optional UCHIME Ref
    # uchime_out falls back to the unfiltered OTUs unless filtering runs.
    uchime_out = otu_clean
    if args.uchime_ref:
        filtered_otus = os.path.join(
            tmp, base + '.EE' + args.maxee + '.uchime.otus.fa')
        #check if file is present, remove from previous run if it is.
        if os.path.isfile(filtered_otus):
            os.remove(filtered_otus)
        # uchime_db stays None when no usable database can be located.
        uchime_db = None
        #R. Edgar now says using largest DB is better for UCHIME, so use the one distributed with taxonomy
        if args.uchime_ref in [
                'ITS', '16S', 'LSU', 'COI'
        ]:  #test if it is one that is setup, otherwise default to full path
            udb = os.path.join(parentdir, 'DB', args.uchime_ref + '.udb')
            extracted = os.path.join(parentdir, 'DB',
                                     args.uchime_ref + '.extracted.fa')
            if not os.path.isfile(udb):
                amptklib.log.error(
                    "Database not properly configured, run `amptk install` to setup DB, skipping chimera filtering"
                )
            else:
                #since uchime cannot work with udb database, need to extract fasta sequences, do this once
                if not amptklib.checkfile(extracted):
                    cmd = [
                        'vsearch', '--udb2fasta', udb, '--output', extracted
                    ]
                    amptklib.runSubprocess(cmd, amptklib.log)
                uchime_db = extracted
        elif os.path.isfile(args.uchime_ref):
            uchime_db = os.path.abspath(args.uchime_ref)
        else:
            amptklib.log.error(
                "%s is not a valid file, skipping reference chimera filtering"
                % args.uchime_ref)
        #now run chimera filtering if all checks out
        if uchime_db is not None:
            uchime_out = filtered_otus
            amptklib.log.info("Chimera Filtering (VSEARCH) using %s DB" %
                              args.uchime_ref)
            cmd = [
                'vsearch', '--mindiv', '1.0', '--uchime_ref', otu_clean,
                '--db', uchime_db, '--nonchimeras', uchime_out, '--threads',
                str(cpus)
            ]
            amptklib.runSubprocess(cmd, amptklib.log)
            total = amptklib.countfasta(uchime_out)
            uchime_chimeras = numOTUs - total
            amptklib.log.info('{0:,}'.format(total) + ' OTUs passed, ' +
                              '{0:,}'.format(uchime_chimeras) +
                              ' ref chimeras')

    #Filter out OTUs in wrong orientation
    amptklib.log.info('Validating OTU orientation')
    passingOTUs = os.path.join(tmp, base + '.passed.otus.fa')
    numKept, numDropped = amptklib.validateorientation(tmp, sort_out,
                                                       uchime_out, passingOTUs)
    amptklib.log.info('{:,} OTUs validated ({:,} dropped)'.format(
        numKept, numDropped))

    #now map reads back to OTUs and build OTU table
    uc_out = os.path.join(tmp, base + '.EE' + args.maxee + '.mapping.uc')
    otu_table = os.path.join(tmp, base + '.EE' + args.maxee + '.otu_table.txt')
    #setup reads to map
    reads = filter_fasta if args.map_filtered else orig_fasta
    amptklib.log.info("Mapping Reads to OTUs and Building OTU table")
    cmd = [
        'vsearch', '--usearch_global', reads, '--strand', 'plus', '--id',
        '0.97', '--db', passingOTUs, '--uc', uc_out, '--otutabout', otu_table,
        '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)

    #count reads mapped
    total = amptklib.line_count2(uc_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads mapped to OTUs ' +
                      '({0:.0f}%)'.format(total / float(orig_total) * 100))

    #Move files around, delete tmp unless --debug was passed.
    currentdir = os.getcwd()
    final_otu = os.path.join(currentdir, base + '.cluster.otus.fa')
    shutil.copyfile(passingOTUs, final_otu)
    final_otu_table = os.path.join(currentdir, base + '.otu_table.txt')
    shutil.copyfile(otu_table, final_otu_table)
    if not args.debug:
        shutil.rmtree(tmp)

    #Print location of files to STDOUT
    print("-------------------------------------------------------")
    print("OTU Clustering Script has Finished Successfully")
    print("-------------------------------------------------------")
    if args.debug:
        print("Tmp Folder of files: %s" % tmp)
    print("Clustered OTUs: %s" % os.path.basename(final_otu))
    print("OTU Table: %s" % os.path.basename(final_otu_table))
    print("-------------------------------------------------------")

    otu_print = final_otu.split('/')[-1]
    tab_print = final_otu_table.split('/')[-1]
    if 'darwin' in sys.platform:
        print(colr.WARN + "\nExample of next cmd:" + colr.END +
              " amptk filter -i %s -f %s -b <mock barcode>\n" %
              (tab_print, otu_print))
    else:
        print(
            "\nExample of next cmd: amptk filter -i %s -f %s -b <mock barcode>\n"
            % (tab_print, otu_print))
Beispiel #10
0
def main(args):
    """Demultiplex paired-end Illumina reads that ship with separate index reads.

    Workflow, in order: parse the command line, set up logging, resolve
    barcodes (from a mapping file or a barcode FASTA) and primers, hand the
    R1/R2/index FASTQ files to the per-chunk worker (in parallel when more
    than one CPU is available), concatenate the per-chunk demuxed output,
    sum the per-chunk stats, re-index the reads, report per-sample counts,
    write a generic mapping file and gzip the final FASTQ.

    Args:
        args: list of command-line tokens, passed to ``parser.parse_args``.

    Returns:
        None. Calls ``sys.exit(1)`` on a missing mapping file, when neither
        a mapping file nor a barcode FASTA is given, when no primers can be
        determined, or when the FASTQ inputs fail ``PEandIndexCheck``.

    Side effects:
        Rebinds the module globals ``FwdPrimer``, ``RevPrimer``, ``Barcodes``,
        ``tmpdir`` and ``usearch`` (presumably read by the worker functions --
        confirm against ``processReadsPE``/``safe_run``). Creates
        ``<out>.amptk-demux.log``, ``<out>.demux.fq.gz``,
        ``<out>.mapping_file.txt`` and a ``<out>_<pid>`` working directory,
        which is only removed when ``--cleanup`` is passed.
    """
    global FwdPrimer, RevPrimer, Barcodes, tmpdir, usearch
    parser = argparse.ArgumentParser(
        prog='amptk-process_illumina_raw.py',
        usage="%(prog)s [options] -i file.fastq\n%(prog)s -h for help menu",
        description=
        '''Script finds barcodes, strips forward and reverse primers, relabels, and then trim/pads reads to a set length''',
        epilog="""Written by Jon Palmer (2015) [email protected]""",
        formatter_class=MyFormatter)

    parser.add_argument('-f',
                        '--forward',
                        dest='fastq',
                        required=True,
                        help='Illumina FASTQ R1 reads')
    parser.add_argument('-r',
                        '--reverse',
                        required=True,
                        help='Illumina FASTQ R2 reads')
    parser.add_argument('-i',
                        '--index',
                        nargs='+',
                        required=True,
                        help='Illumina FASTQ index reads')
    parser.add_argument('-m', '--mapping_file', help='QIIME-like mapping file')
    parser.add_argument('--read_length',
                        type=int,
                        help='Read length, i.e. 2 x 300 bp = 300')
    parser.add_argument('-o',
                        '--out',
                        dest="out",
                        default='illumina_out',
                        help='Base name for output')
    parser.add_argument('--fwd_primer',
                        dest="F_primer",
                        default='515FB',
                        help='Forward Primer')
    parser.add_argument('--rev_primer',
                        dest="R_primer",
                        default='806RB',
                        help='Reverse Primer')
    parser.add_argument('--primer_mismatch',
                        default=2,
                        type=int,
                        help='Number of mis-matches in primer')
    parser.add_argument('--barcode_mismatch',
                        default=0,
                        type=int,
                        help='Number of mis-matches in barcode')
    parser.add_argument(
        '--barcode_fasta',
        help='FASTA file containing Barcodes (Names & Sequences)')
    parser.add_argument('--rescue_forward',
                        default='on',
                        choices=['on', 'off'],
                        help='Rescue Not-merged forward reads')
    parser.add_argument('--barcode_rev_comp',
                        action='store_true',
                        help='Reverse complement barcode sequences')
    parser.add_argument('--min_len',
                        default=100,
                        type=int,
                        help='Minimum read length to keep')
    parser.add_argument('-l',
                        '--trim_len',
                        default=300,
                        type=int,
                        help='Trim length for reads')
    parser.add_argument('-p',
                        '--pad',
                        default='off',
                        choices=['on', 'off'],
                        help='Pad with Ns to a set length')
    parser.add_argument('--cpus',
                        type=int,
                        help="Number of CPUs. Default: auto")
    parser.add_argument('-u',
                        '--usearch',
                        dest="usearch",
                        default='usearch9',
                        help='USEARCH9 EXE')
    parser.add_argument('--cleanup',
                        action='store_true',
                        help='remove intermediate files')
    parser.add_argument('--merge_method',
                        default='usearch',
                        choices=['usearch', 'vsearch'],
                        help='Software to use for PE read merging')
    args = parser.parse_args(args)

    #drop every non-word character so the base name is safe inside file names
    args.out = re.sub(r'\W+', '', args.out)

    #start each run with a fresh log file
    log_name = args.out + '.amptk-demux.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    amptklib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")

    #initialize script, log system info and usearch version
    amptklib.SystemInfo()
    #get version of amptk
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)

    #get number of CPUs to use
    if not args.cpus:
        cpus = multiprocessing.cpu_count()
    else:
        cpus = args.cpus

    #create tmpdir, the pid suffix keeps concurrent runs apart
    tmpdir = args.out.split('.')[0] + '_' + str(os.getpid())
    if not os.path.exists(tmpdir):
        os.makedirs(tmpdir)

    #parse a mapping file or a barcode fasta file, primers, etc get setup
    #dealing with Barcodes, get ion barcodes or parse the barcode_fasta argument
    barcode_file = args.out + ".barcodes_used.fa"
    if os.path.isfile(barcode_file):
        os.remove(barcode_file)

    #check if mapping file passed, use this if present, otherwise use command line arguments
    SampleData = {}
    Barcodes = {}
    RevBarcodes = {}
    FwdPrimer = ''
    RevPrimer = ''
    if args.mapping_file:
        if not os.path.isfile(args.mapping_file):
            amptklib.log.error("Mapping file not found: %s" %
                               args.mapping_file)
            sys.exit(1)
        SampleData, Barcodes, RevBarcodes, FwdPrimer, RevPrimer = amptklib.parseMappingFileNEW(
            args.mapping_file)
    else:  #no mapping file, so create dictionaries from barcode fasta files
        if not args.barcode_fasta:
            amptklib.log.error(
                "You did not specify a --barcode_fasta or --mapping_file, one is required"
            )
            sys.exit(1)
        shutil.copyfile(args.barcode_fasta, barcode_file)
        Barcodes = amptklib.fasta2barcodes(barcode_file, False)

    #primers from the mapping file win; only fall back to the command line
    #when the mapping file did not provide both of them
    if FwdPrimer == '' or RevPrimer == '':
        #parse primers here so doesn't conflict with mapping primers
        #look up primer db otherwise default to entry
        if args.F_primer in amptklib.primer_db:
            FwdPrimer = amptklib.primer_db.get(args.F_primer)
            amptklib.log.info(
                "{:} fwd primer found in AMPtk primer db, setting to: {:}".
                format(args.F_primer, FwdPrimer))
        else:
            FwdPrimer = args.F_primer
            amptklib.log.info(
                "{:} fwd primer not found in AMPtk primer db, assuming it is actual primer sequence."
                .format(args.F_primer))
        if args.R_primer in amptklib.primer_db:
            RevPrimer = amptklib.primer_db.get(args.R_primer)
            amptklib.log.info(
                "{:} rev primer found in AMPtk primer db, setting to: {:}".
                format(args.R_primer, RevPrimer))
        else:
            RevPrimer = args.R_primer
            amptklib.log.info(
                "{:} rev primer not found in AMPtk primer db, assuming it is actual primer sequence."
                .format(args.R_primer))

    #if still no primers set, then exit
    if FwdPrimer == '' or RevPrimer == '':
        amptklib.log.error(
            "Please provide primer sequences via --fwd_primer and --rev_primer"
        )
        sys.exit(1)

    #if barcode_rev_comp passed then reverse complement the barcode sequences
    #(the dictionary values); the keys are left untouched
    if args.barcode_rev_comp:
        amptklib.log.info("Reverse complementing barcode sequences")
        Barcodes = {k: amptklib.RevComp(v) for k, v in Barcodes.items()}

    amptklib.log.info("Loading %i samples from mapping file" % len(Barcodes))
    amptklib.log.info('FwdPrimer: {:}  RevPrimer: {:}'.format(
        FwdPrimer, RevPrimer))
    amptklib.log.info(
        'Dropping reads less than {:} bp and setting lossless trimming to {:} bp.'
        .format(args.min_len, args.trim_len))

    #rename reads according to indexes
    #only the first --index file is used, any additional ones are ignored
    if not amptklib.PEandIndexCheck(
            args.fastq, args.reverse,
            args.index[0]):  #check they are all same length
        amptklib.log.error("FASTQ input malformed, read numbers do not match")
        sys.exit(1)
    amptklib.log.info("Loading FASTQ Records")
    NumSeqs = amptklib.countfastq(args.fastq)
    if cpus > 1:
        amptklib.log.info("Splitting FASTQ files over {:} cpus".format(cpus))
        amptklib.split_fastqPEandI(args.fastq, args.reverse, args.index[0],
                                   NumSeqs, tmpdir, cpus * 2)
        #collect one entry per chunk: the path up to (not including) '_R'
        file_list = []
        for file in os.listdir(tmpdir):
            if file.endswith('.fq'):
                filepart = os.path.join(tmpdir, file.split('_R')[0])
                if filepart not in file_list:
                    file_list.append(filepart)

        amptklib.log.info("Mapping indexes to reads and renaming PE reads")
        amptklib.runMultiProgress(safe_run, file_list, cpus, args=args)
    else:
        #single cpu: stage the inputs as one chunk using the same
        #<chunk>_R1/_R2/_R3 naming that the multi-cpu path collects
        amptklib.log.info("Mapping indexes to reads and renaming PE reads")
        shutil.copyfile(args.fastq, os.path.join(tmpdir, 'chunk_R1.fq'))
        shutil.copyfile(args.reverse, os.path.join(tmpdir, 'chunk_R2.fq'))
        shutil.copyfile(args.index[0], os.path.join(tmpdir, 'chunk_R3.fq'))
        processReadsPE(os.path.join(tmpdir, 'chunk'), args=args)

    print("-------------------------------------------------------")
    #Now concatenate all of the demuxed files together
    amptklib.log.info("Concatenating Demuxed Files")

    tmpDemux = os.path.join(tmpdir, args.out + '.demux.fq')
    with open(tmpDemux, 'wb') as outfile:
        for filename in glob.glob(os.path.join(tmpdir, '*.demux.fq')):
            #the combined file lives in tmpdir too and matches the glob
            if filename == tmpDemux:
                continue
            #read as bytes to match the binary destination; a text-mode
            #source fed to a 'wb' handle raises TypeError on Python 3
            with open(filename, 'rb') as readfile:
                shutil.copyfileobj(readfile, outfile)
    #parse the stats: first line of every *.stats file is a comma separated
    #list of integers, summed position by position
    finalstats = [0, 0, 0, 0, 0, 0]
    for file in os.listdir(tmpdir):
        if file.endswith('.stats'):
            with open(os.path.join(tmpdir, file), 'r') as statsfile:
                line = statsfile.readline()
                line = line.replace('\n', '')
                newstats = [int(i) for i in line.split(',')]
                for x, num in enumerate(newstats):
                    finalstats[x] += num

    #finally reindex output
    #last thing is to re-number of reads as it is possible they could have same name from multitprocessor split
    Demux = args.out + '.demux.fq'
    amptklib.fastqreindex(tmpDemux, Demux)
    amptklib.SafeRemove(tmpDemux)

    #output stats of the run
    amptklib.log.info('{0:,}'.format(finalstats[0]) + ' total reads')
    amptklib.log.info('{0:,}'.format(finalstats[0] - finalstats[1]) +
                      ' discarded no index match')
    amptklib.log.info('{0:,}'.format(finalstats[2]) +
                      ' Fwd Primer found, {0:,}'.format(finalstats[3]) +
                      ' Rev Primer found')
    amptklib.log.info('{0:,}'.format(finalstats[4]) +
                      ' discarded too short (< %i bp)' % args.min_len)
    amptklib.log.info('{0:,}'.format(finalstats[5]) + ' valid output reads')

    #now loop through data and find barcoded samples, counting each.....
    BarcodeCount = {}
    with open(Demux, 'r') as infile:
        #every 4th line starting at the first one is a FASTQ header
        for line in itertools.islice(infile, 0, None, 4):
            #sample ID = text after the first '=' up to the next ';'
            ID = line.split("=", 1)[-1].split(";")[0]
            BarcodeCount[ID] = BarcodeCount.get(ID, 0) + 1

    #now let's count the barcodes found and count the number of times they are found.
    count_lines = ["%30s:  %s" % ('Sample', 'Count')]
    for k, v in natsorted(list(BarcodeCount.items()),
                          key=lambda k_v: k_v[1],
                          reverse=True):
        count_lines.append("%30s:  %s" % (k, str(v)))
    barcode_counts = "\n".join(count_lines)
    amptklib.log.info("Found %i barcoded samples\n%s" %
                      (len(BarcodeCount), barcode_counts))

    #create mapping file if one doesn't exist
    genericmapfile = args.out + '.mapping_file.txt'
    amptklib.CreateGenericMappingFile(Barcodes, {}, FwdPrimer, RevPrimer,
                                      genericmapfile, BarcodeCount)

    #compress the output to save space
    FinalDemux = Demux + '.gz'
    amptklib.Fzip(Demux, FinalDemux, cpus)
    amptklib.removefile(Demux)

    if args.cleanup:
        amptklib.SafeRemove(tmpdir)

    #get file size
    filesize = os.path.getsize(FinalDemux)
    readablesize = amptklib.convertSize(filesize)
    amptklib.log.info("Output file:  %s (%s)" % (FinalDemux, readablesize))
    amptklib.log.info("Mapping file: %s" % genericmapfile)
    print("-------------------------------------------------------")
    #colored hint only on macOS, plain text elsewhere
    if 'darwin' in sys.platform:
        print(col.WARN + "\nExample of next cmd: " + col.END +
              "amptk cluster -i %s -o out\n" % (FinalDemux))
    else:
        print("\nExample of next cmd: amptk cluster -i %s -o out\n" %
              (FinalDemux))
Beispiel #11
0
def main(args):
    """Demultiplex single-file reads (BAM/FASTQ/SFF/FASTA+QUAL) by barcode.

    Workflow, in order: parse the command line, set up logging, resolve
    barcodes and primers (from a mapping file, a built-in barcode set, or
    user supplied FASTA files), convert the input to FASTQ when needed,
    run ``processRead`` over the reads (split into chunks when more than one
    CPU is available), concatenate the per-chunk output, sum the per-chunk
    stats, re-index the reads, report per-sample counts, write or update a
    mapping file and gzip the final FASTQ.

    Args:
        args: list of command-line tokens, passed to ``parser.parse_args``.

    Returns:
        None. Calls ``sys.exit(1)`` on a missing mapping file, on a barcode
        name from ``--list_barcodes`` that is not in the built-in set, on
        SFF/FASTA input without custom barcodes or a mapping file, on FASTA
        input without ``--qual``, or on a missing reverse barcode file.

    Side effects:
        Rebinds the module globals ``FwdPrimer``, ``RevPrimer``, ``Barcodes``
        and ``tmpdir`` (presumably read by ``processRead`` -- confirm).
        ``RevPrimer`` is stored reverse-complemented. Creates
        ``<out>.amptk-demux.log``, ``<out>.barcodes_used.fa``,
        ``<out>.demux.fq.gz`` and ``<out>.mapping_file.txt``; the
        ``<out>_<pid>`` working directory is always removed.
    """
    global FwdPrimer, RevPrimer, Barcodes, tmpdir
    parser = argparse.ArgumentParser(
        prog='amptk-process_ion.py',
        usage="%(prog)s [options] -i file.fastq\n%(prog)s -h for help menu",
        description=
        '''Script finds barcodes, strips forward and reverse primers, relabels, and then trim/pads reads to a set length''',
        epilog="""Written by Jon Palmer (2015) [email protected]""",
        formatter_class=MyFormatter)

    parser.add_argument('-i',
                        '--fastq',
                        '--sff',
                        '--fasta',
                        '--bam',
                        dest='fastq',
                        required=True,
                        help='BAM/FASTQ/SFF/FASTA file')
    parser.add_argument('-q', '--qual', help='QUAL file (if -i is FASTA)')
    parser.add_argument('-o',
                        '--out',
                        dest="out",
                        default='ion',
                        help='Base name for output')
    parser.add_argument('-f',
                        '--fwd_primer',
                        dest="F_primer",
                        default='fITS7-ion',
                        help='Forward Primer')
    parser.add_argument('-r',
                        '--rev_primer',
                        dest="R_primer",
                        default='ITS4',
                        help='Reverse Primer')
    parser.add_argument(
        '-m',
        '--mapping_file',
        help='Mapping file: QIIME format can have extra meta data columns')
    parser.add_argument('-p',
                        '--pad',
                        default='off',
                        choices=['on', 'off'],
                        help='Pad with Ns to a set length')
    parser.add_argument('--primer_mismatch',
                        default=2,
                        type=int,
                        help='Number of mis-matches in primer')
    parser.add_argument('--barcode_mismatch',
                        default=0,
                        type=int,
                        help='Number of mis-matches in barcode')
    parser.add_argument(
        '--barcode_fasta',
        default='ionxpress',
        help='FASTA file containing Barcodes (Names & Sequences)')
    parser.add_argument('--reverse_barcode',
                        help='FASTA file containing 3 prime Barocdes')
    parser.add_argument('-b',
                        '--list_barcodes',
                        dest="barcodes",
                        default='all',
                        help='Enter Barcodes used separated by commas')
    parser.add_argument('--min_len',
                        default=100,
                        type=int,
                        help='Minimum read length to keep')
    parser.add_argument('-l',
                        '--trim_len',
                        default=300,
                        type=int,
                        help='Trim length for reads')
    parser.add_argument(
        '--full_length',
        action='store_true',
        help='Keep only full length reads (no trimming/padding)')
    parser.add_argument('--mult_samples',
                        dest="multi",
                        default='False',
                        help='Combine multiple samples (i.e. FACE1)')
    parser.add_argument('--ion',
                        action='store_true',
                        help='Input data is Ion Torrent')
    parser.add_argument('--454', action='store_true', help='Input data is 454')
    parser.add_argument('--cpus',
                        type=int,
                        help="Number of CPUs. Default: auto")
    parser.add_argument('-u',
                        '--usearch',
                        dest="usearch",
                        default='usearch9',
                        help='USEARCH EXE')
    args = parser.parse_args(args)

    #drop every non-word character so the base name is safe inside file names
    args.out = re.sub(r'\W+', '', args.out)

    #start each run with a fresh log file
    log_name = args.out + '.amptk-demux.log'
    if os.path.isfile(log_name):
        os.remove(log_name)
    amptklib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")

    #initialize script, log system info and usearch version
    amptklib.SystemInfo()
    #Do a version check
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)

    #get number of CPUs to use
    if not args.cpus:
        cpus = multiprocessing.cpu_count()
    else:
        cpus = args.cpus

    #parse a mapping file or a barcode fasta file, primers, etc get setup
    #dealing with Barcodes, get ion barcodes or parse the barcode_fasta argument
    barcode_file = args.out + ".barcodes_used.fa"
    rev_barcode_file = args.out + '.revbarcodes_used.fa'
    amptklib.SafeRemove(barcode_file)
    amptklib.SafeRemove(rev_barcode_file)

    #check if mapping file passed, use this if present, otherwise use command line arguments
    SampleData = {}
    Barcodes = {}
    RevBarcodes = {}
    if args.mapping_file:
        if not os.path.isfile(args.mapping_file):
            amptklib.log.error("Mapping file not found: %s" %
                               args.mapping_file)
            sys.exit(1)
        SampleData, Barcodes, RevBarcodes, FwdPrimer, RevPrimer = amptklib.parseMappingFileNEW(
            args.mapping_file)
    else:  #no mapping file, so create dictionaries from barcode fasta files
        if args.barcode_fasta == 'ionxpress':
            #get script path and barcode file name
            pgm_barcodes = os.path.join(os.path.dirname(amptklib.__file__),
                                        'DB', 'ionxpress_barcodes.fa')
        elif args.barcode_fasta == 'ioncode':
            pgm_barcodes = os.path.join(os.path.dirname(amptklib.__file__),
                                        'DB', 'ioncode_barcodes.fa')
        if args.barcode_fasta in ('ionxpress', 'ioncode'):
            if args.barcodes == "all":
                if args.multi == 'False':
                    shutil.copyfile(pgm_barcodes, barcode_file)
                else:
                    #prefix every barcode name with the --mult_samples label
                    with open(barcode_file, 'w') as barcodeout:
                        with open(pgm_barcodes, 'r') as infile:
                            for rec in SeqIO.parse(infile, 'fasta'):
                                outname = args.multi + '.' + rec.id
                                barcodeout.write(">%s\n%s\n" %
                                                 (outname, rec.seq))
            else:
                #only keep the barcodes named in --list_barcodes
                #plain 'r' is used: the 'U' flag was removed in Python 3.11
                #and text mode already normalises newlines
                with open(pgm_barcodes, "r") as inputSeqFile:
                    SeqRecords = SeqIO.to_dict(
                        SeqIO.parse(inputSeqFile, "fasta"))
                #open the output once instead of once per barcode
                with open(barcode_file, "a") as outputSeqFile:
                    for rec in args.barcodes.split(","):
                        name = "BC." + rec.strip()
                        if name not in SeqRecords:
                            amptklib.log.error(
                                "Barcode %s not found in %s" %
                                (name, pgm_barcodes))
                            sys.exit(1)
                        seq = SeqRecords[name].seq
                        if args.multi != 'False':
                            outname = args.multi + '.' + name
                        else:
                            outname = name
                        outputSeqFile.write(">%s\n%s\n" % (outname, seq))
        else:
            #check for multi_samples and add if necessary
            if args.multi == 'False':
                shutil.copyfile(args.barcode_fasta, barcode_file)
                if args.reverse_barcode:
                    shutil.copyfile(args.reverse_barcode, rev_barcode_file)
            else:
                with open(barcode_file, 'w') as barcodeout:
                    with open(args.barcode_fasta, 'r') as infile:
                        for rec in SeqIO.parse(infile, 'fasta'):
                            outname = args.multi + '.' + rec.id
                            barcodeout.write(">%s\n%s\n" % (outname, rec.seq))
                if args.reverse_barcode:
                    with open(rev_barcode_file, 'w') as barcodeout:
                        with open(args.reverse_barcode, 'r') as infile:
                            for rec in SeqIO.parse(infile, 'fasta'):
                                outname = args.multi + '.' + rec.id
                                barcodeout.write(">%s\n%s\n" %
                                                 (outname, rec.seq))

        #parse primers here so doesn't conflict with mapping primers
        #look up primer db otherwise default to entry
        if args.F_primer in amptklib.primer_db:
            FwdPrimer = amptklib.primer_db.get(args.F_primer)
            amptklib.log.info(
                "{:} fwd primer found in AMPtk primer db, setting to: {:}".
                format(args.F_primer, FwdPrimer))
        else:
            FwdPrimer = args.F_primer
            amptklib.log.info(
                "{:} fwd primer not found in AMPtk primer db, assuming it is actual primer sequence."
                .format(args.F_primer))
        if args.R_primer in amptklib.primer_db:
            RevPrimer = amptklib.primer_db.get(args.R_primer)
            amptklib.log.info(
                "{:} rev primer found in AMPtk primer db, setting to: {:}".
                format(args.R_primer, RevPrimer))
        else:
            RevPrimer = args.R_primer
            amptklib.log.info(
                "{:} rev primer not found in AMPtk primer db, assuming it is actual primer sequence."
                .format(args.R_primer))

    #check if input is compressed
    gz_suffix = '.gz'
    gzip_list = []
    if args.fastq.endswith(gz_suffix):
        gzip_list.append(os.path.abspath(args.fastq))
    if gzip_list:
        amptklib.log.info("Gzipped input files detected, uncompressing")
        for file in gzip_list:
            #strip only the trailing suffix; str.replace would also remove
            #a '.gz' that appears earlier in the path
            file_out = file[:-len(gz_suffix)]
            amptklib.Funzip(file, file_out, cpus)
        args.fastq = args.fastq[:-len(gz_suffix)]

    #if SFF file passed, convert to FASTQ with biopython
    if args.fastq.endswith(".sff"):
        #the default ionxpress barcodes are rejected for this input type
        if args.barcode_fasta == 'ionxpress' and not args.mapping_file:
            amptklib.log.error(
                "You did not specify a --barcode_fasta or --mapping_file, one is required for 454 data"
            )
            sys.exit(1)
        amptklib.log.info("SFF input detected, converting to FASTQ")
        SeqIn = args.out + '.sff.extract.fastq'
        SeqIO.convert(args.fastq, "sff-trim", SeqIn, "fastq")
    elif args.fastq.endswith((".fas", ".fasta", ".fa")):
        if not args.qual:
            amptklib.log.error(
                "FASTA input detected, however no QUAL file was given.  You must have FASTA + QUAL files"
            )
            sys.exit(1)
        if args.barcode_fasta == 'ionxpress' and not args.mapping_file:
            amptklib.log.error(
                "You did not specify a --barcode_fasta or --mapping_file, one is required for 454 data"
            )
            sys.exit(1)
        SeqIn = args.out + '.fastq'
        amptklib.log.info("FASTA + QUAL detected, converting to FASTQ")
        amptklib.faqual2fastq(args.fastq, args.qual, SeqIn)
    elif args.fastq.endswith('.bam'):
        #so we can convert natively with pybam, however it is 10X slower than bedtools/samtools
        #since samtools is fastest, lets use that if exists, if not then bedtools, else default to pybam
        amptklib.log.info("Converting Ion Torrent BAM file to FASTQ")
        SeqIn = args.out + '.fastq'
        if amptklib.which('samtools'):
            cmd = ['samtools', 'fastq', '-@', str(cpus), args.fastq]
            amptklib.runSubprocess2(cmd, amptklib.log, SeqIn)
        elif amptklib.which('bedtools'):
            cmd = ['bedtools', 'bamtofastq', '-i', args.fastq, '-fq', SeqIn]
            amptklib.runSubprocess(cmd, amptklib.log)
        else:  #default to pybam
            amptklib.bam2fastq(args.fastq, SeqIn)
    else:
        SeqIn = args.fastq

    #start here to process the reads, first reverse complement the reverse primer
    #the untouched primer is kept for the generic mapping file written later
    catDemux = args.out + '.demux.fq'
    origRevPrimer = RevPrimer
    RevPrimer = amptklib.RevComp(RevPrimer)
    amptklib.log.info("Foward primer: %s,  Rev comp'd rev primer: %s" %
                      (FwdPrimer, RevPrimer))

    #then setup barcode dictionary
    if len(Barcodes) < 1:
        Barcodes = amptklib.fasta2barcodes(barcode_file, False)

    #setup for looking for reverse barcode
    if len(RevBarcodes) < 1 and args.reverse_barcode:
        if not os.path.isfile(args.reverse_barcode):
            amptklib.log.info("Reverse barcode is not a valid file, exiting")
            sys.exit(1)
        #NOTE(review): this copy overwrites the --mult_samples prefixed
        #reverse barcode file written above, so reverse barcode names lose
        #the prefix -- confirm this is intended
        shutil.copyfile(args.reverse_barcode, rev_barcode_file)
        RevBarcodes = amptklib.fasta2barcodes(rev_barcode_file, True)

    #Count FASTQ records
    amptklib.log.info("Loading FASTQ Records")
    orig_total = amptklib.countfastq(SeqIn)
    size = amptklib.checkfastqsize(SeqIn)
    readablesize = amptklib.convertSize(size)
    amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize +
                      ')')

    #create tmpdir and split input into n cpus
    tmpdir = args.out.split('.')[0] + '_' + str(os.getpid())
    if not os.path.exists(tmpdir):
        os.makedirs(tmpdir)

    amptklib.log.info(
        'Dropping reads less than {:} bp and setting lossless trimming to {:} bp.'
        .format(args.min_len, args.trim_len))

    if cpus > 1:
        #split fastq file
        amptklib.log.info("Splitting FASTQ files over {:} cpus".format(cpus))
        amptklib.split_fastq(SeqIn, orig_total, tmpdir, cpus * 2)
        #now get file list from tmp folder
        file_list = [
            os.path.join(tmpdir, file) for file in os.listdir(tmpdir)
            if file.endswith(".fq")
        ]
        #finally process reads over number of cpus
        amptklib.runMultiProgress(processRead, file_list, cpus, args=args)
    else:
        shutil.copyfile(SeqIn, os.path.join(tmpdir, 'chunk.fq'))
        processRead(os.path.join(tmpdir, 'chunk.fq'), args=args)

    print("-------------------------------------------------------")
    #Now concatenate all of the demuxed files together
    amptklib.log.info("Concatenating Demuxed Files")

    #the combined file is written outside tmpdir, so the glob below only
    #ever sees the per-chunk outputs
    tmpDemux = args.out + '.tmp.demux.fq'
    with open(tmpDemux, 'w') as outfile:
        for filename in glob.glob(os.path.join(tmpdir, '*.demux.fq')):
            with open(filename, 'r') as readfile:
                shutil.copyfileobj(readfile, outfile)
    #parse the stats: first line of every *.stats file is a comma separated
    #list of integers, summed position by position
    finalstats = [0, 0, 0, 0, 0, 0, 0]
    for file in os.listdir(tmpdir):
        if file.endswith('.stats'):
            with open(os.path.join(tmpdir, file), 'r') as statsfile:
                line = statsfile.readline()
                line = line.rstrip()
                newstats = [int(i) for i in line.split(',')]
                for x, num in enumerate(newstats):
                    finalstats[x] += num

    #clean up tmp folder
    shutil.rmtree(tmpdir)

    #last thing is to re-number of reads as it is possible they could have same name from multitprocessor split
    amptklib.fastqreindex(tmpDemux, catDemux)
    os.remove(tmpDemux)

    amptklib.log.info('{0:,}'.format(finalstats[0]) + ' total reads')
    if args.reverse_barcode:
        amptklib.log.info('{0:,}'.format(finalstats[0] - finalstats[1] -
                                         finalstats[2] - finalstats[4]) +
                          ' valid Fwd and Rev Barcodes')
    else:
        amptklib.log.info('{0:,}'.format(finalstats[0] - finalstats[1]) +
                          ' valid Barcode')
        amptklib.log.info('{0:,}'.format(finalstats[0] - finalstats[1] -
                                         finalstats[2]) +
                          ' Fwd Primer found, {0:,}'.format(finalstats[3]) +
                          ' Rev Primer found')
    amptklib.log.info('{0:,}'.format(finalstats[5]) +
                      ' discarded too short (< %i bp)' % args.min_len)
    amptklib.log.info('{0:,}'.format(finalstats[6]) + ' valid output reads')

    #now loop through data and find barcoded samples, counting each.....
    BarcodeCount = {}
    with open(catDemux, 'r') as infile:
        #every 4th line starting at the first one is a FASTQ header
        for line in itertools.islice(infile, 0, None, 4):
            #sample ID = text after the first '=' up to the next ';'
            ID = line.split("=", 1)[-1].split(";")[0]
            BarcodeCount[ID] = BarcodeCount.get(ID, 0) + 1

    #now let's count the barcodes found and count the number of times they are found.
    count_lines = ["%22s:  %s" % ('Sample', 'Count')]
    for k, v in natsorted(list(BarcodeCount.items()),
                          key=lambda k_v: k_v[1],
                          reverse=True):
        count_lines.append("%22s:  %s" % (k, str(v)))
    barcode_counts = "\n".join(count_lines)
    amptklib.log.info("Found %i barcoded samples\n%s" %
                      (len(BarcodeCount), barcode_counts))

    #create a generic mappingfile for downstream processes
    genericmapfile = args.out + '.mapping_file.txt'
    if not args.mapping_file:
        amptklib.CreateGenericMappingFile(Barcodes, RevBarcodes, FwdPrimer,
                                          origRevPrimer, genericmapfile,
                                          BarcodeCount)
    else:
        amptklib.updateMappingFile(args.mapping_file, BarcodeCount,
                                   genericmapfile)

    #compress the output to save space
    FinalDemux = catDemux + '.gz'
    amptklib.Fzip(catDemux, FinalDemux, cpus)
    amptklib.removefile(catDemux)
    #remove the uncompressed copies of gzipped inputs created above
    for file in gzip_list:
        amptklib.removefile(file[:-len(gz_suffix)])

    #get file size
    filesize = os.path.getsize(FinalDemux)
    readablesize = amptklib.convertSize(filesize)
    amptklib.log.info("Output file:  %s (%s)" % (FinalDemux, readablesize))
    amptklib.log.info("Mapping file: %s" % genericmapfile)

    print("-------------------------------------------------------")
    #colored hint only on macOS, plain text elsewhere
    if 'darwin' in sys.platform:
        print(col.WARN + "\nExample of next cmd: " + col.END +
              "amptk cluster -i %s -o out\n" % (FinalDemux))
    else:
        print("\nExample of next cmd: amptk cluster -i %s -o out\n" %
              (FinalDemux))
Beispiel #12
0
def main(args):
    """Run the UNOISE2 denoising pipeline on a demultiplexed FASTQ file.

    The steps are executed in this order:

    1. convert the input FASTQ to FASTA (used later for read mapping)
    2. expected-error quality filtering (VSEARCH)
    3. full-length dereplication (VSEARCH)
    4. denoising with ``-unoise2`` (USEARCH)
    5. padding removal and optional reference chimera filtering (VSEARCH)
    6. ASV orientation validation and ASV table construction
    7. clustering of ASVs into OTUs at ``--pct_otu`` and OTU table
       construction

    Final outputs are written to the current working directory using the
    base name (``--out`` or derived from the FASTQ name); intermediates go
    to ``<base>_tmp`` which is removed unless ``--debug`` is given.

    Args:
        args: list of command-line argument strings passed to argparse.
    """
    parser = argparse.ArgumentParser(
        prog='amptk-unoise2.py',
        usage="%(prog)s [options] -i file.demux.fq\n%(prog)s -h for help menu",
        description='Script runs UNOISE2 algorithm.\n'
        '\t\tRequires USEARCH9 by Robert C. Edgar: http://drive5.com/usearch',
        epilog="""Written by Jon Palmer (2016) [email protected]""",
        formatter_class=MyFormatter)

    parser.add_argument('-i',
                        '--fastq',
                        dest="FASTQ",
                        required=True,
                        help='FASTQ file (Required)')
    parser.add_argument('-o', '--out', help='Base output name')
    parser.add_argument('-e',
                        '--maxee',
                        default='1.0',
                        help='Quality trim EE value')
    parser.add_argument('-m',
                        '--minsize',
                        default='8',
                        help='Min size to keep for denoising')
    parser.add_argument('-u',
                        '--usearch',
                        dest="usearch",
                        default='usearch9',
                        help='USEARCH9 EXE')
    parser.add_argument('-p',
                        '--pct_otu',
                        default='97',
                        help="Biological OTU Clustering Percent")
    parser.add_argument('--uchime_ref',
                        help='Run UCHIME2 REF [ITS,16S,LSU,COI,custom]')
    parser.add_argument('--map_filtered',
                        action='store_true',
                        help='map quality filtered reads back to OTUs')
    # The tmp folder is only deleted when this flag is NOT set (see below),
    # so the help text describes it as keeping the intermediates.
    parser.add_argument('--debug',
                        action='store_true',
                        help='Keep intermediate files')
    parser.add_argument('--cpus',
                        type=int,
                        help="Number of CPUs. Default: auto")
    args = parser.parse_args(args)

    parentdir = os.path.join(os.path.dirname(amptklib.__file__))

    #get basename if not args.out passed
    if args.out:
        base = args.out
    elif 'demux' in args.FASTQ:
        base = os.path.basename(args.FASTQ).split('.demux')[0]
    else:
        base = os.path.basename(args.FASTQ).split('.f')[0]

    #remove logfile if exists
    log_name = base + '.amptk-unoise2.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    amptklib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")

    #initialize script, log system info and usearch version
    amptklib.SystemInfo()
    #Do a version check
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)

    #get number of cpus
    cpus = args.cpus if args.cpus else amptklib.getCPUS()

    #make tmp folder
    tmp = base + '_tmp'
    if not os.path.exists(tmp):
        os.makedirs(tmp)

    # every intermediate that depends on the EE threshold shares this prefix
    ee_prefix = base + '.EE' + args.maxee

    #Count FASTQ records
    amptklib.log.info("Loading FASTQ Records")
    #convert to FASTA for mapping
    orig_fasta = os.path.join(tmp, base + '.orig.fa')
    cmd = [
        'vsearch', '--fastq_filter', args.FASTQ, '--fastaout', orig_fasta,
        '--fastq_qmax', '55', '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    orig_total = amptklib.countfasta(orig_fasta)
    size = amptklib.checkfastqsize(args.FASTQ)
    readablesize = amptklib.convertSize(size)
    amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize +
                      ')')

    #Expected Errors filtering step
    filter_out = os.path.join(tmp, ee_prefix + '.filter.fq')
    filter_fasta = os.path.join(tmp, ee_prefix + '.filter.fa')
    amptklib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
    cmd = [
        'vsearch', '--fastq_filter', args.FASTQ, '--fastq_maxee',
        str(args.maxee), '--fastqout', filter_out, '--fastaout', filter_fasta,
        '--fastq_qmax', '55', '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfastq(filter_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')

    #now run full length dereplication
    derep_out = os.path.join(tmp, ee_prefix + '.derep.fa')
    amptklib.log.info("De-replication (remove duplicate reads)")
    cmd = [
        'vsearch', '--derep_fulllength', filter_out, '--relabel', 'Read_',
        '--sizeout', '--output', derep_out, '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(derep_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')

    #now run de-noiser UNOISE2
    amptklib.log.info("Denoising reads with UNOISE2")
    unoise_out = os.path.join(tmp, ee_prefix + '.unoise.fa')
    cmd = [
        usearch, '-unoise2', derep_out, '-fastaout', unoise_out, '-minampsize',
        args.minsize, '-threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(unoise_out)
    amptklib.log.info('{0:,}'.format(total) + ' denoised sequences')

    #strip N's
    amptklib.log.info("Cleaning up padding from OTUs")
    otu_clean = os.path.join(tmp, ee_prefix + '.clean.fa')
    amptklib.fasta_strip_padding(unoise_out, otu_clean)

    #run optional uchime_ref
    # Default: no chimera filtering, downstream steps use the cleaned ASVs.
    uchime_out = otu_clean
    if args.uchime_ref:
        uchime_db = None
        #R. Edgar now says using largest DB is better for UCHIME, so use the one distributed with taxonomy
        if args.uchime_ref in ('ITS', '16S', 'LSU', 'COI'):
            #it is one of the pre-configured names, look in the install DB folder
            udb_path = os.path.join(parentdir, 'DB', args.uchime_ref + '.udb')
            extracted_fa = os.path.join(parentdir, 'DB',
                                        args.uchime_ref + '.extracted.fa')
            if not os.path.isfile(udb_path):
                # Nothing to extract from: honour the message and skip the
                # whole chimera step instead of calling vsearch on a file
                # that does not exist.
                amptklib.log.error(
                    "Database not properly configured, run `amptk install` to setup DB, skipping chimera filtering"
                )
            else:
                #since uchime cannot work with udb database, need to extract fasta sequences (only once)
                if not amptklib.checkfile(extracted_fa):
                    cmd = [
                        'vsearch', '--udb2fasta', udb_path, '--output',
                        extracted_fa
                    ]
                    amptklib.runSubprocess(cmd, amptklib.log)
                uchime_db = extracted_fa
        else:
            #otherwise treat the value as a path to a custom reference
            uchime_db = os.path.abspath(args.uchime_ref)

        #now run chimera filtering if all checks out
        if uchime_db is not None:
            uchime_out = os.path.join(tmp, ee_prefix + '.uchime.otus.fa')
            # an existing result (e.g. re-run on a kept tmp folder) is reused
            if not os.path.isfile(uchime_out):
                amptklib.log.info("Chimera Filtering (VSEARCH)")
                cmd = [
                    'vsearch', '--mindiv', '1.0', '--uchime_ref', otu_clean,
                    '--db', uchime_db, '--nonchimeras', uchime_out,
                    '--threads',
                    str(cpus)
                ]
                amptklib.runSubprocess(cmd, amptklib.log)
                total = amptklib.countfasta(uchime_out)
                amptklib.log.info('{0:,}'.format(total) + ' OTUs passed')

    #inferred sequences
    iSeqs = base + '.ASVs.fa'
    amptklib.fastarename(uchime_out, 'ASV', iSeqs)

    #Filter out ASVs in wrong orientation
    amptklib.log.info('Validating ASV orientation')
    passingOTUs = os.path.join(tmp, base + '.passed.asvs.fa')
    numKept, numDropped = amptklib.validateorientation(tmp, derep_out,
                                                       uchime_out, passingOTUs)
    amptklib.log.info('{:,} ASVs validated ({:,} dropped)'.format(
        numKept, numDropped))

    #setup reads to map (same choice is used for the ASV and the OTU table)
    reads = filter_fasta if args.map_filtered else orig_fasta

    #build OTU table with iSeqs
    uc_iSeq_out = os.path.join(tmp, ee_prefix + '.mapping.uc')
    iSeq_otu_table = base + '.otu_table.txt'
    amptklib.log.info("Mapping Reads to ASVs and Building OTU table")
    cmd = [
        'vsearch', '--usearch_global', reads, '--strand', 'plus', '--id',
        '0.97', '--db', passingOTUs, '--uc', uc_iSeq_out, '--otutabout',
        iSeq_otu_table, '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)

    #count reads mapped
    total = amptklib.line_count2(uc_iSeq_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads mapped to ASVs ' +
                      '({0:.0f}%)'.format(total / float(orig_total) * 100))

    #now cluster to biological OTUs with UCLUST
    radius = float(args.pct_otu) / 100.
    amptklib.log.info(
        "Clustering denoised sequences into biological OTUs at %s%%" %
        args.pct_otu)
    uclust_out = os.path.join(tmp, ee_prefix + '.uclust.fa')
    cmd = [
        'vsearch', '--cluster_smallmem', passingOTUs, '--centroids',
        uclust_out, '--id',
        str(radius), '--strand', 'plus', '--relabel', 'OTU', '--qmask', 'none',
        '--usersort', '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(uclust_out)
    amptklib.log.info('{0:,}'.format(total) + ' OTUs generated')

    #determine where denoised sequences clustered
    ClusterComp = base + '.ASVs2clusters.txt'
    iSeqmap = base + '.unoise_map.uc'
    cmd = [
        usearch, '-usearch_global', passingOTUs, '-db', uclust_out, '-id',
        str(radius), '-uc', iSeqmap, '-strand', 'plus', '-threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    # group query labels (column 9, 0-based 8) by target label (column 10)
    iSeqMapped = {}
    with open(iSeqmap, 'r') as mapping:
        for line in mapping:
            cols = line.replace('\n', '').split('\t')
            iSeqMapped.setdefault(cols[9], []).append(cols[8])
    with open(ClusterComp, 'w') as clusters:
        clusters.write('OTU\tASVs\n')
        for otu_name, members in natsorted(list(iSeqMapped.items())):
            clusters.write('%s\t%s\n' % (otu_name, ', '.join(members)))

    #now map reads back to OTUs and build OTU table
    uc_out = os.path.join(tmp, ee_prefix + '.cluster.mapping.uc')
    otu_table = os.path.join(tmp, ee_prefix + '.cluster.otu_table.txt')
    amptklib.log.info("Mapping Reads to OTUs and Building OTU table")
    cmd = [
        'vsearch', '--usearch_global', reads, '--strand', 'plus', '--id',
        '0.97', '--db', uclust_out, '--uc', uc_out, '--otutabout', otu_table,
        '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)

    #count reads mapped
    total = amptklib.line_count2(uc_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads mapped to OTUs ' +
                      '({0:.0f}%)'.format(total / float(orig_total) * 100))

    #Move files around, delete tmp unless --debug was passed.
    currentdir = os.getcwd()
    final_otu = os.path.join(currentdir, base + '.cluster.otus.fa')
    shutil.copyfile(uclust_out, final_otu)
    final_otu_table = os.path.join(currentdir, base + '.cluster.otu_table.txt')
    shutil.copyfile(otu_table, final_otu_table)
    if not args.debug:
        shutil.rmtree(tmp)

    #Print location of files to STDOUT
    print("-------------------------------------------------------")
    print("UNOISE2 Script has Finished Successfully")
    print("-------------------------------------------------------")
    if args.debug:
        print("Tmp Folder of files: %s" % tmp)
    # NOTE(review): passingOTUs lives inside tmp, which has already been
    # removed at this point unless --debug was given, so this path may not
    # exist any more. Confirm which file should be reported/kept here.
    print("Amplicon sequence variants: %s" % passingOTUs)
    print("ASV OTU Table: %s" % iSeq_otu_table)
    print("Clustered OTUs: %s" % os.path.basename(final_otu))
    print("OTU Table: %s" % os.path.basename(final_otu_table))
    print("ASVs 2 OTUs: %s" % ClusterComp)
    print("-------------------------------------------------------")

    # basename() instead of split('/') so the hint is also right on Windows
    otu_print = os.path.basename(final_otu)
    tab_print = os.path.basename(final_otu_table)
    if 'darwin' in sys.platform:
        print(colr.WARN + "\nExample of next cmd:" + colr.END +
              " amptk filter -i %s -f %s -b <mock barcode>\n" %
              (tab_print, otu_print))
    else:
        print(
            "\nExample of next cmd: amptk filter -i %s -f %s -b <mock barcode>\n"
            % (tab_print, otu_print))
Beispiel #13
0
def main(args):
    """Drop named OTUs from a FASTA file and rebuild the OTU table.

    The names to drop come from exactly one of ``--list`` (names on the
    command line) or ``--file`` (one name per line). The remaining OTUs are
    written to ``<base>.cleaned.otus.fa``; the demultiplexed reads are then
    converted to FASTA and mapped back with VSEARCH to produce
    ``<base>.cleaned.otu_table.txt``.

    Args:
        args: list of command-line argument strings passed to argparse.

    Exits with status 1 when neither or both of ``--list``/``--file`` are
    supplied.
    """
    parser = argparse.ArgumentParser(
        prog='amptk-drop.py',
        description='''Script that drops OTUs and then creates OTU table''',
        epilog="""Written by Jon Palmer (2016) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i',
                        '--input',
                        required=True,
                        help='OTUs in FASTA format')
    parser.add_argument('-r',
                        '--reads',
                        required=True,
                        help='Demuxed reads FASTQ format')
    parser.add_argument('-o', '--out', help='Base output name')
    parser.add_argument('-l',
                        '--list',
                        nargs='+',
                        help='Input list of (BC) names to remove')
    parser.add_argument('-f',
                        '--file',
                        help='File containing list of names to remove')
    args = parser.parse_args(args)

    #get basename if not args.out passed
    if args.out:
        base = args.out
    elif 'otus' in args.input:
        base = os.path.basename(args.input).split('.otus')[0]
    else:
        base = os.path.basename(args.input).split('.f')[0]

    #remove logfile if exists
    log_name = base + '.amptk-drop.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    amptklib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")

    #initialize script, log system info and usearch version
    amptklib.SystemInfo()

    #check the list or file parameters, exactly one of them must be given
    if not args.list and not args.file:
        amptklib.log.error(
            "Error, you must specifiy a list of OTU names or a file containing names"
        )
        sys.exit(1)
    if args.list and args.file:
        amptklib.log.error(
            "Error, you must specifiy either list of OTU names or a file containing OTU names, not both"
        )
        sys.exit(1)

    #load in list of names to remove
    if args.file:
        with open(args.file, 'r') as names_handle:
            # strip() also removes a trailing '\r' from Windows line endings;
            # blank lines carry no name and are ignored
            lines = [ln.strip() for ln in names_handle if ln.strip()]
    else:
        lines = args.list
    count = len(lines)
    #make sure it is a set, faster lookup
    dropList = set(lines)

    #load data
    total = amptklib.countfasta(args.input)
    amptklib.log.info("Loading %i OTUs" % total)

    #copy every record whose id is not in the drop list to the new FASTA
    amptklib.log.info("Dropping %i OTUs" % count)
    newOTUs = base + '.cleaned.otus.fa'
    with open(newOTUs, 'w') as otus, open(args.input, 'r') as fasta:
        for rec in SeqIO.parse(fasta, 'fasta'):
            if rec.id not in dropList:
                SeqIO.write(rec, otus, 'fasta')

    #now make new OTU table
    amptklib.log.info("Mapping Reads to OTUs and Building OTU table")
    newTable = base + '.cleaned.otu_table.txt'
    tmpReads = base + '.reads.tmp'
    uc_out = base + '.mapping.uc'
    #reads must be FASTA for mapping
    cmd = [
        'vsearch', '--fastq_filter', args.reads, '--fastaout', tmpReads,
        '--fastq_qmax', '55'
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    cmd = [
        'vsearch', '--usearch_global', tmpReads, '--strand', 'plus', '--id',
        '0.97', '--db', newOTUs, '--uc', uc_out, '--otutabout', newTable
    ]
    amptklib.runSubprocess(cmd, amptklib.log)

    #count OTUs
    otu_count = amptklib.countfasta(newOTUs)
    amptklib.log.info('{0:,}'.format(otu_count) + ' OTUs remaining')

    #count reads mapped
    total = amptklib.line_count(uc_out)
    orig_total = amptklib.countfasta(tmpReads)
    amptklib.log.info('{0:,}'.format(total) + ' reads mapped to OTUs ' +
                      '({0:.0f}%)'.format(total / float(orig_total) * 100))

    #Print location of files to STDOUT
    print("-------------------------------------------------------")
    print("Clustered OTUs: %s" % newOTUs)
    print("OTU Table: %s" % newTable)
    print("-------------------------------------------------------")

    #cleanup
    amptklib.removefile(tmpReads)
    amptklib.removefile(uc_out)

    # basename() instead of split('/') so the hint is also right on Windows
    otu_print = os.path.basename(newOTUs)
    tab_print = os.path.basename(newTable)
    if 'darwin' in sys.platform:
        print(colr.WARN + "\nExample of next cmd:" + colr.END +
              " amptk filter -i %s -f %s -b <mock barcode>\n" %
              (tab_print, otu_print))
    else:
        print(
            "\nExample of next cmd: amptk filter -i %s -f %s -b <mock barcode>\n"
            % (tab_print, otu_print))
Beispiel #14
0
def main(args):
    """Post-process an OTU table with the LULU R package.

    Builds a pairwise match list of the OTUs against themselves with
    VSEARCH, hands it together with the OTU table to ``runLULU.R`` (located
    next to ``amptklib``), then writes a FASTA containing only the OTUs that
    are not listed in the drop list produced by the R script.

    Outputs in the current directory: ``<base>.lulu.otu_table.txt``,
    ``<base>.lulu.otu-map.txt`` and ``<base>.lulu.otus.fa``. The working
    folder ``lulu_<pid>`` is removed unless ``--debug`` is given.

    Args:
        args: list of command-line argument strings passed to argparse.

    Exits with status 1 when the LULU version is reported as ``0.0.0``.
    """
    parser = argparse.ArgumentParser(
        prog='amptk-lulu.py',
        description=
        '''Script runs OTU table post processing LULU to identify low abundance error OTUs''',
        epilog="""Written by Jon Palmer (2018) [email protected]""",
        formatter_class=MyFormatter)

    parser.add_argument('-i',
                        '--otu_table',
                        required=True,
                        help='Input OTU table')
    parser.add_argument('-f',
                        '--fasta',
                        required=True,
                        help='Input OTUs (multi-fasta)')
    parser.add_argument('-o', '--out', help='Output folder basename')
    parser.add_argument('--min_ratio_type',
                        default='min',
                        choices=['min', 'avg'],
                        help="LULU minimum ratio threshold")
    parser.add_argument('--min_ratio',
                        default=1,
                        type=int,
                        help="LULU minimum ratio")
    parser.add_argument('--min_match',
                        default=84,
                        type=int,
                        help="LULU minimum match percent identity")
    parser.add_argument('--min_relative_cooccurence',
                        default=95,
                        type=int,
                        help="LULU minimum relative cooccurance")
    # The working folder is only deleted when this flag is NOT set (see the
    # end of this function), so the help text describes it as keeping files.
    parser.add_argument('--debug',
                        action='store_true',
                        help='Keep intermediate files')
    args = parser.parse_args(args)

    #get location of R script
    parentdir = os.path.join(os.path.dirname(amptklib.__file__))
    luluScript = os.path.join(parentdir, 'runLULU.R')

    #get base name of files
    if args.out:
        base = args.out
    elif 'otu_table' in args.otu_table:
        base = os.path.basename(args.otu_table).split(".otu_table")[0]
    elif 'final.txt' in args.otu_table:
        base = os.path.basename(args.otu_table).split(".final")[0]
    else:
        base = os.path.basename(args.otu_table).split(".txt")[0]

    #remove logfile if exists
    log_name = base + '.amptk-lulu.log'
    if os.path.isfile(log_name):
        amptklib.removefile(log_name)

    amptklib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")
    #initialize script, log system info and usearch version
    amptklib.SystemInfo()
    amptklib.versionDependencyChecks('usearch9')
    #check dependencies
    programs = ['Rscript', 'vsearch']
    amptklib.CheckDependencies(programs)
    Rversions = amptklib.checkRversion()
    if Rversions[3] == '0.0.0':
        # the R version placeholder was previously never filled in
        amptklib.log.info("R v%s installed, LULU not installed" %
                          Rversions[0])
        sys.exit(1)
    else:
        amptklib.log.info("R v%s; LULU v%s" % (Rversions[0], Rversions[3]))

    #this is a simple wrapper for an R script so easier to run from amptk menu
    tmpdir = 'lulu_' + str(os.getpid())
    if not os.path.isdir(tmpdir):
        os.makedirs(tmpdir)

    #generate the match list using the minimum match pident
    match_file = os.path.join(tmpdir, 'match_file.txt')
    amptklib.log.info("Loading {:,} OTUs".format(
        amptklib.countfasta(args.fasta)))
    amptklib.log.info(
        "Generating pairwise percent identity between OTUs using VSEARCH at {:}% identity"
        .format(args.min_match))
    otu_fasta = os.path.abspath(args.fasta)
    # divide by 100.0 so the percent -> fraction conversion is a float
    # division regardless of interpreter version
    cmd = [
        'vsearch', '--usearch_global', otu_fasta, '--db', otu_fasta, '--self',
        '--id',
        str(args.min_match / 100.0), '--iddef', '1', '--userout', match_file,
        '--userfields', 'query+target+id', '--maxaccepts', '0', '--query_cov',
        '.9', '--maxhits', '10'
    ]
    amptklib.runSubprocess(cmd, amptklib.log)

    #now run LULU in R
    LULU_log = os.path.join(tmpdir, 'LULU-R.log')
    lulu_otu_table = base + '.lulu.otu_table.txt'
    dropList = os.path.join(tmpdir, 'droplist.txt')
    MapData = base + '.lulu.otu-map.txt'
    amptklib.log.info("Running LULU algorithm")
    cmd = [
        'Rscript', '--vanilla', luluScript,
        os.path.abspath(args.otu_table),
        os.path.abspath(match_file), args.min_ratio_type,
        str(args.min_ratio),
        str(args.min_match),
        str(args.min_relative_cooccurence / 100.0), lulu_otu_table, dropList,
        MapData
    ]
    amptklib.runSubprocess4(cmd, amptklib.log, LULU_log)

    #get updated OTUs
    # plain 'r' mode: the 'U' flag is rejected by Python 3.11+ and universal
    # newlines are already the default for text mode in Python 3
    with open(dropList, 'r') as dropped:
        remove = [name for name in (ln.rstrip() for ln in dropped) if name]
    remove_lookup = set(remove)  # O(1) membership test per FASTA record
    lulu_otus = base + '.lulu.otus.fa'
    with open(lulu_otus, 'w') as output, open(args.fasta, 'r') as infasta:
        for record in SeqIO.parse(infasta, 'fasta'):
            if record.id not in remove_lookup:
                output.write('>%s\n%s\n' % (record.id, record.seq))
    amptklib.log.info(
        "LULU has merged {:,} OTUs, output data contains {:,} OTUs".format(
            len(remove), amptklib.countfasta(lulu_otus)))
    amptklib.log.info("LULU OTU table post processing finished\n\
----------------------------------\n\
OTU table:  {:}\n\
OTU FASTA:  {:}\n\
LULU map:   {:}\n\
----------------------------------".format(lulu_otu_table, lulu_otus, MapData))
    if 'win32' in sys.platform:
        # no ANSI colour codes on Windows consoles
        print(
            "\nExample of next cmd: amptk taxonomy -f %s -i %s -m mapping_file.txt -d ITS2\n"
            % (lulu_otus, lulu_otu_table))
    else:
        print(colr.WARN + "\nExample of next cmd:" + colr.END +
              " amptk taxonomy -f %s -i %s -m mapping_file.txt -d ITS2\n" %
              (lulu_otus, lulu_otu_table))
    if not args.debug:
        if os.path.isdir(tmpdir):
            shutil.rmtree(tmpdir)
    print("-------------------------------------------------------")