Example #1
def main(args):
    parser = argparse.ArgumentParser(
        prog='amptk-filter.py',
        description='''Script inspects the output of amptk-OTU_cluster.py and
        determines a useful threshold for OTU output based on a spike-in
        mock community.''',
        epilog="""Written by Jon Palmer (2015) [email protected]""",
        formatter_class=MyFormatter)

    parser.add_argument('-i',
                        '--otu_table',
                        required=True,
                        help='Input OTU table')
    parser.add_argument('-f',
                        '--fasta',
                        required=True,
                        help='Input OTUs (multi-fasta)')
    parser.add_argument('-b',
                        '--mock_barcode',
                        help='Barcode of mock community')
    parser.add_argument('-p',
                        '--index_bleed',
                        help='Index Bleed filter. Default: auto')
    parser.add_argument('-t',
                        '--threshold',
                        default='max',
                        choices=['sum', 'max', 'top25', 'top10', 'top5'],
                        help='Threshold to use when calculating index-bleed')
    parser.add_argument(
        '-c',
        '--calculate',
        default='all',
        choices=['all', 'in'],
        help='Calculate index-bleed; if using a synthetic mock use "all", otherwise use "in"'
    )
    parser.add_argument('-s',
                        '--subtract',
                        default=0,
                        help='Threshold to subtract (integer or "auto")')
    parser.add_argument('-n',
                        '--normalize',
                        default='y',
                        choices=['y', 'n'],
                        help='Normalize OTU table prior to filtering')
    parser.add_argument('-m', '--mc', help='Multi-FASTA mock community')
    parser.add_argument(
        '-d',
        '--drop',
        nargs='+',
        help='samples to drop from table after index-bleed filtering')
    parser.add_argument('--ignore',
                        nargs='+',
                        help='Ignore OTUs during index-bleed')
    parser.add_argument('--delimiter',
                        default='tsv',
                        choices=['csv', 'tsv'],
                        help='Output delimiter')
    parser.add_argument('--col_order',
                        nargs='+',
                        dest="col_order",
                        help='Space-separated list specifying column (sample) order')
    parser.add_argument('--keep_mock',
                        action='store_true',
                        help='Keep mock sample in OTU table (Default: False)')
    parser.add_argument('--show_stats',
                        action='store_true',
                        help='Print stats table to STDOUT')
    parser.add_argument('--negatives',
                        nargs='+',
                        help='Negative Control Sample names')
    parser.add_argument('-o', '--out', help='Base output name')
    parser.add_argument('--min_reads_otu',
                        default=2,
                        type=int,
                        help='Minimum number of reads per OTU for experiment')
    parser.add_argument(
        '--min_samples_otu',
        default=1,
        type=int,
        help='Minimum number of samples per OTU for experiment')
    parser.add_argument('-u',
                        '--usearch',
                        dest="usearch",
                        default='usearch9',
                        help='USEARCH9 EXE')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Keep intermediate files')
    args = parser.parse_args(args)

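    #locate the installed amptk package directory; the bundled DB/ folder lives under it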
    parentdir = os.path.dirname(amptklib.__file__)

    if not args.out:
        #get base name of files
        base = args.otu_table.split(".otu_table")[0]
    else:
        base = args.out

    #remove logfile if exists
    log_name = base + '.amptk-filter.log'
    amptklib.removefile(log_name)

    amptklib.setupLogging(log_name)
    FNULL = open(os.devnull, 'w')
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")

    #initialize script, log system info and usearch version
    amptklib.SystemInfo()
    #Do a version check
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)

    #check if otu_table is empty
    amptklib.log.info("Loading OTU table: %s" % args.otu_table)
    check = os.stat(args.otu_table).st_size
    if check == 0:
        amptklib.log.error("Input OTU table is empty")
        sys.exit(1)
    #get the OTU header label (it varies with how the OTU table was constructed); needed later for indexing
    with open(args.otu_table, 'r') as f:
        first_line = f.readline()
        OTUhead = first_line.split('\t')[0]

    if args.delimiter == 'csv':
        delim = str(',')
        ending = '.csv'
    elif args.delimiter == 'tsv':
        delim = str('\t')
        ending = '.txt'

    #setup outputs
    sorted_table = base + '.sorted' + ending
    normal_table_pct = base + '.normalized.pct' + ending
    normal_table_nums = base + '.normalized.num' + ending
    subtract_table = base + '.normalized.subtract' + ending
    filtered_table = base + '.normalized' + ending
    final_table = base + '.final' + ending
    final_binary_table = base + '.final.binary' + ending
    stats_table = base + '.stats' + ending

    #load OTU table into pandas DataFrame
    df = pd.read_csv(args.otu_table, sep='\t')
    df.set_index(OTUhead, inplace=True)
    headers = df.columns.values.tolist()
    if headers[-1] == 'taxonomy' or headers[-1] == 'Taxonomy':
        otuDict = df[headers[-1]].to_dict()
        del df[headers[-1]]
    else:
        otuDict = False

    #parse OTU table to get count data for each OTU
    AddCounts = {}
    OTUcounts = df.sum(axis=1)
    for x in OTUcounts.index:
        AddCounts[x] = int(OTUcounts[x])

    #now add counts to fasta header
    FastaCounts = base + '.otus.counts.fa'
    OTU_tax = {}
    with open(FastaCounts, 'w') as outfile:
        with open(args.fasta, 'r') as infile:
            for rec in SeqIO.parse(infile, 'fasta'):
                if ';' in rec.id:  #this should mean there is taxonomy, so split it
                    ID = rec.id.split(';', 1)[0]
                    tax = rec.id.split(';', 1)[-1]
                    OTU_tax[ID] = tax
                    if ID in AddCounts:
                        count = AddCounts.get(ID)
                    else:
                        count = 0
                    outfile.write('>%s;size=%i\n%s\n' % (ID, count, rec.seq))
                else:  #no tax, just process
                    if rec.id in AddCounts:
                        count = AddCounts.get(rec.id)
                    else:
                        count = 0
                    outfile.write('>%s;size=%i\n%s\n' %
                                  (rec.id, count, rec.seq))

    amptklib.log.info(
        'OTU table contains {:,} samples, {:,} OTUs, and {:,} read counts'.
        format(len(df.columns.values.tolist()), len(df.index),
               int(df.values.sum())))

    #setup output files/variables
    mock_out = base + '.mockmap.txt'

    if args.mock_barcode:  #if user passes a column name for mock
        #check if mock barcode is valid
        validBCs = df.columns.values.tolist()
        if not args.mock_barcode in validBCs:
            amptklib.log.error("%s not a valid barcode." % args.mock_barcode)
            amptklib.log.error("Valid barcodes: %s" % (' '.join(validBCs)))
            sys.exit(1)
        if args.col_order and not args.mock_barcode in args.col_order:
            amptklib.log.error("Error: %s not listed in --col_order." %
                               args.mock_barcode)
            sys.exit(1)
        #make sure there is a --mc passed here otherwise throw error
        if not args.mc:
            amptklib.log.error(
                "If using the -b,--barcode option you must specify a fasta file of mock community via the --mc option"
            )
            sys.exit(1)
        #get default mock community value
        if args.mc == "mock3":
            mock = os.path.join(parentdir, 'DB', 'amptk_mock3.fa')
        elif args.mc == "mock2":
            mock = os.path.join(parentdir, 'DB', 'amptk_mock2.fa')
        elif args.mc == "mock1":
            mock = os.path.join(parentdir, 'DB', 'amptk_mock1.fa')
        elif args.mc == "synmock":
            mock = os.path.join(parentdir, 'DB', 'amptk_synmock.fa')
        else:
            mock = os.path.abspath(args.mc)

        #open mock community fasta and count records
        mock_ref_count = amptklib.countfasta(mock)

        #load OTU lengths into dictionary
        SeqLength = amptklib.fastalen2dict(args.fasta)

        #map OTUs to mock community with usearch_global, keeping all hits (-maxaccepts 0) so variants and chimeras can be classified
        amptklib.log.info("Mapping OTUs to Mock Community (USEARCH)")
        cmd = [
            usearch, '-usearch_global', mock, '-strand', 'plus', '-id', '0.65',
            '-db', FastaCounts, '-userout', mock_out, '-userfields',
            'query+target+id+ql+tl+alnlen+caln+mism+diffs', '-maxaccepts', '0',
            '-maxrejects', '0'
        ]
        amptklib.runSubprocess(cmd, amptklib.log)

        #generate dictionary for name change
        '''
        If args.calculate is set to 'all', the script is measuring a synthetic
        mock. In that case chimeras are hits < 95% identical to mock members,
        and variants are hits in between, i.e. >= 95% but not the best hit.
        '''
        Results = {}
        errorrate = {}
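        #userout columns follow the -userfields order:
        #query(mock) target(otu;size=n) id ql tl alnlen caln mism diffs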
        with open(mock_out, 'r') as mapfile:  #avoid shadowing the builtin map()
            for line in mapfile:
                line = line.replace('\n', '')
                cols = line.split('\t')
                MockID = cols[0]
                hit = cols[1].split(';size=')
                otuID = hit[0]
                abundance = int(hit[1])
                pident = float(cols[2])
                length = int(cols[4])
                mism = int(cols[7])
                diffs = int(cols[8])
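                #composite score rewards abundant, high-identity, full-length hits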
                score = abundance * pident * length
                if not otuID in errorrate:
                    errorrate[otuID] = [MockID, diffs]
                else:
                    olderror = errorrate.get(otuID)
                    if diffs < olderror[1]:
                        errorrate[otuID] = [MockID, diffs]
                if not MockID in Results:
                    Results[MockID] = [(otuID, abundance, pident, length, mism,
                                        diffs, score)]
                else:
                    Results[MockID].append(
                        (otuID, abundance, pident, length, mism, diffs, score))

        found_dict = {}
        chimeras = []
        variants = []
        missing = []
        for k, v in natsorted(list(Results.items())):
            besthit = []
            #v is a list of tuples of results, parse through to get best hit
            for y in v:
                if y[2] >= 97.0:
                    besthit.append(y)
                elif y[2] >= 95.0 and y[2] < 97.0:
                    if not y[0] in variants:
                        variants.append(y[0])
                else:
                    if not y[0] in chimeras:
                        chimeras.append(y[0])
            if len(besthit) > 0:
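                #sort hits by abundance, then keep the highest composite score among the top 3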
                besthit.sort(key=lambda x: x[1], reverse=True)
                best = sorted(besthit[:3], key=lambda x: x[6], reverse=True)
                found_dict[k] = best[0]
            else:
                missing.append(k)

        #make name change dict
        annotate_dict = {}
        seen = []
        for k, v in natsorted(list(found_dict.items())):
            ID = v[0].replace('_chimera', '')
            newID = k + '_pident=' + str(v[2]) + '_' + v[0]
            annotate_dict[ID] = newID
            if not v[0] in seen:
                seen.append(v[0])
        if args.calculate == 'all':
            chimeras = [x for x in chimeras if x not in seen]
            variants = [x for x in variants if x not in seen]
            for i in chimeras:
                annotate_dict[i] = i + '_suspect_mock_chimera'
            for x in variants:
                annotate_dict[x] = x + '_suspect_mock_variant'
        if len(missing) > 0:
            amptklib.log.info("%i mock missing: %s" %
                              (len(missing), ', '.join(missing)))
    else:
        otu_new = args.fasta

    #rename OTUs
    if args.mock_barcode:
        df.rename(index=annotate_dict, inplace=True)

    #sort the table
    df2 = df.reindex(index=natsorted(df.index))
    if not args.col_order:
        amptklib.log.info("Sorting OTU table naturally")
        df = df2.reindex(columns=natsorted(df2.columns))
    else:
        amptklib.log.info(
            "Sorting OTU table by user defined order (--col_order)")
        col_headers = args.col_order
        #keep only names present in the table; filter into a new list (removing while iterating skips entries)
        col_headers = [i for i in col_headers if i in df2.columns.values]
        df = df2.reindex(columns=col_headers)
    SortedTable = df
    if otuDict:
        df['Taxonomy'] = pd.Series(otuDict)
        df.to_csv(sorted_table, sep=delim)
        del df['Taxonomy']
    else:
        df.to_csv(sorted_table, sep=delim)

    #get sums of columns
    fs = df.sum(axis=0)
    #fs.to_csv('reads.per.sample.csv')
    otus_per_sample_original = df[df > 0].count(axis=0, numeric_only=True)
    filtered = pd.DataFrame(df, columns=fs.index)
    filt2 = filtered.loc[(filtered != 0).any(axis=1)]
    tos = filt2.sum(axis=1)
    #a valid OTU must be seen at least min_reads_otu times across the experiment (default 2, i.e. no singletons)
    fotus = tos[tos >= args.min_reads_otu]
    if len(fotus.index) < len(tos.index):
        diff = len(tos.index) - len(fotus.index)
        amptklib.log.info(
            "Removing {:,} OTUs according to --min_reads_otu {:,}".format(
                diff, args.min_reads_otu))
    filt3 = pd.DataFrame(filt2, index=fotus.index)

    if args.normalize == 'y':
        #normalize the OTU table
        normal = filt3.truediv(fs)
        if otuDict:
            normal['Taxonomy'] = pd.Series(otuDict)
            normal.to_csv(normal_table_pct, sep=delim)
            del normal['Taxonomy']
        else:
            normal.to_csv(normal_table_pct, sep=delim)
        #normalize back to read counts, pretend 100,000 reads in each
        norm_round = np.round(normal.multiply(100000), decimals=0)
        if otuDict:
            norm_round['Taxonomy'] = pd.Series(otuDict)
            norm_round.to_csv(normal_table_nums, sep=delim)
            del norm_round['Taxonomy']
        else:
            norm_round.to_csv(normal_table_nums, sep=delim)
        amptklib.log.info(
            "Normalizing OTU table to number of reads per sample")
    else:
        norm_round = filt3

    if args.mock_barcode:
        #now calculate the index-bleed in both directions (into the mock and mock into the other samples)
        mock = []
        sample = []
        #get names from mapping
        for k, v in list(annotate_dict.items()):
            if not '_suspect_mock_' in v:
                mock.append(v)
        for i in norm_round.index:
            if not i in mock:
                sample.append(i)
        if args.ignore:
            mock = [x for x in mock if x not in args.ignore]
            sample = [x for x in sample if x not in args.ignore]
        #first calculate bleed out of mock community
        #slice normalized dataframe to get only mock OTUs from table
        mock_df = pd.DataFrame(norm_round, index=mock)
        #if there are samples to drop, make sure they aren't being used in this calculation
        if args.drop:
            mock_df.drop(args.drop, axis=1, inplace=True)
        #get total number of reads from mock OTUs from entire table
        total = np.sum(np.sum(mock_df, axis=None))
        #now drop the mock barcode sample
        mock_df.drop(args.mock_barcode, axis=1, inplace=True)
        #get number of reads that are result of bleed over
        bleed1 = np.sum(np.sum(mock_df, axis=None))
        #calculate rate of bleed by taking num reads bleed divided by the total
        bleed1max = bleed1 / float(total)

        #second, calculate bleed into mock community
        #get list of mock OTUs not found in any other sample -> these are likely chimeras
        mock_only = pd.DataFrame(norm_round,
                                 index=list(norm_round.index),
                                 columns=[args.mock_barcode])
        mock_OTUs_zeros = mock_only.loc[(mock_only == 0).all(axis=1)]
        theRest = [
            x for x in list(norm_round.columns.values)
            if x not in [args.mock_barcode]
        ]
        non_mocks = pd.DataFrame(norm_round, index=sample, columns=theRest)
        non_mock_zeros = non_mocks.loc[(non_mocks == 0).all(axis=1)]
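        #OTUs with reads only in the mock sample (zero everywhere else) are likely mock chimeras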
        zeros = [
            x for x in list(non_mock_zeros.index)
            if x not in list(mock_OTUs_zeros.index)
        ]
        if len(zeros) > 0:
            amptklib.log.info(
                "Found {:,} mock chimeras (only in mock sample and not mapped to mock sequences) excluding from index-bleed calculation"
                .format(len(zeros)))
            amptklib.log.debug('{:}'.format(', '.join(zeros)))
        #now get updated list of samples, dropping chimeras
        samples_trimmed = [x for x in sample if x not in zeros]
        #slice the OTU table to get all OTUs that are not in mock community from the mock sample
        sample_df = pd.DataFrame(norm_round,
                                 index=samples_trimmed,
                                 columns=[args.mock_barcode])
        #get total number of reads that don't belong in mock
        bleed2 = np.sum(np.sum(sample_df, axis=None))
        #now pull the entire mock sample
        mock_sample = pd.DataFrame(norm_round, columns=[args.mock_barcode])
        #calculate bleed into mock by taking num reads that don't belong divided by the total, i.e. the percentage of bad reads in the mock
        bleed2max = bleed2 / float(np.sum(mock_sample.sum(axis=1)))
        #autocalculate the subtraction filter by taking the maximum value that doesn't belong
        subtract_num = max(sample_df.max())

        #get max values for bleed
        #can only use into samples measurement if not using synmock
        if args.calculate == 'all':
            if bleed1max > bleed2max:
                bleedfilter = math.ceil(bleed1max * 1000) / 1000
            else:
                bleedfilter = math.ceil(bleed2max * 1000) / 1000
            amptklib.log.info(
                "Index bleed, mock into samples: %f%%.  Index bleed, samples into mock: %f%%."
                % (bleed1max * 100, bleed2max * 100))
        else:
            bleedfilter = math.ceil(bleed2max * 1000) / 1000
            amptklib.log.info("Index bleed, samples into mock: %f%%." %
                              (bleed2max * 100))

    else:
        bleedfilter = args.index_bleed  #this is the value needed to filter MiSeq; Ion is likely less, but shouldn't affect the data much either way.

    if args.index_bleed:
        args.index_bleed = float(args.index_bleed)
        amptklib.log.info(
            "Overwriting auto detect index-bleed, setting to %f%%" %
            (args.index_bleed * 100))
        bleedfilter = args.index_bleed
    else:
        if bleedfilter:
            amptklib.log.info(
                "Will use value of %f%% for index-bleed OTU filtering." %
                (bleedfilter * 100))
        else:
            bleedfilter = 0  #no filtering if you don't pass -p or -b
            amptklib.log.info(
                "No spike-in mock (-b) or index-bleed (-p) specified, thus not running index-bleed filtering"
            )

    if bleedfilter > 0.05:
        amptklib.log.info(
            "Index bleed into samples is abnormally high (%f%%), if you have biological mock you should use `--calculate in`"
            % (bleedfilter * 100))

    #to combat barcode switching, loop through each OTU filtering out if less than bleedfilter threshold
    cleaned = []
    for row in norm_round.itertuples():
        result = [row[0]]
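        #row[0] is the OTU ID; the remaining fields are per-sample counts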
        if args.threshold == 'max':
            total = max(row[1:])  #use the max OTU count in the row to calculate index-bleed from
        elif args.threshold == 'sum':
            total = sum(row[1:])
        elif args.threshold == 'top25':
            top = sorted(row[1:], key=int, reverse=True)
            topn = int(round(len(row[1:]) * 0.25))
            total = sum(top[:topn])
        elif args.threshold == 'top10':
            top = sorted(row[1:], key=int, reverse=True)
            topn = int(round(len(row[1:]) * 0.10))
            total = sum(top[:topn])
        elif args.threshold == 'top5':
            top = sorted(row[1:], key=int, reverse=True)
            topn = int(round(len(row[1:]) * 0.05))
            total = sum(top[:topn])
        sub = total * bleedfilter
        for i in row[1:]:
            if i < sub:
                i = 0
            result.append(i)
        cleaned.append(result)

    header = [OTUhead]
    for i in norm_round.columns:
        header.append(i)

    #create data frame of index bleed filtered results
    final = pd.DataFrame(cleaned, columns=header)
    final.set_index(OTUhead, inplace=True)

    if args.drop:  #if user has passed samples to drop, do it here, subtract drop list from Header
        amptklib.log.info("Dropping %i samples from table: %s" %
                          (len(args.drop), ', '.join(args.drop)))

        colsdrop = []
        for x in args.drop:
            if x in header:
                colsdrop.append(x)
        #now drop those columns
        final.drop(colsdrop, axis=1, inplace=True)

    if args.subtract != 'auto':
        subtract_num = int(args.subtract)
    else:
        try:
            subtract_num = int(subtract_num)
            amptklib.log.info("Auto subtract filter set to %i" % subtract_num)
        except NameError:
            subtract_num = 0
            amptklib.log.info(
                "Error: to use 'auto' subtract feature, provide a sample name to -b,--mock_barcode."
            )
    if subtract_num != 0:
        amptklib.log.info("Subtracting %i from OTU table" % subtract_num)
        sub = final.subtract(subtract_num)
        sub[sub < 0] = 0  #if negative, change to zero
        sub = sub.loc[~(sub == 0).all(axis=1)]
        sub = sub.astype(int)
        if otuDict:
            sub['Taxonomy'] = pd.Series(otuDict)
            sub.to_csv(subtract_table, sep=delim)
            del sub['Taxonomy']
        else:
            sub.to_csv(subtract_table, sep=delim)
        otus_if_sub = sub[sub > 0].count(axis=0, numeric_only=True)
        final = sub.astype(int)
    otus_per_sample = final[final > 0].count(axis=0, numeric_only=True)
    stats = pd.concat([fs, otus_per_sample_original, otus_per_sample], axis=1)
    stats.columns = ['reads per sample', 'original OTUs', 'final OTUs']
    stats.fillna(0, inplace=True)
    stats = stats.astype(int)
    if args.show_stats:
        print(stats.to_string())
    stats.to_csv(stats_table, sep=delim)
    #after all filtering, get list of OTUs in mock barcode
    if args.mock_barcode:
        mocks = final[args.mock_barcode]
        mocks = mocks.loc[~(mocks == 0)].astype(int)
        totalmismatches = 0
        totallength = 0
        chimera_count = 0
        variant_count = 0
        for otu in mocks.index:
            count = mocks[otu]
            if 'suspect_mock' in otu:
                if 'chimera' in otu:
                    chimera_count += 1
                if 'variant' in otu:
                    variant_count += 1
                otu = otu.split('_', 1)[0]
            else:
                otu = otu.split('_', -1)[-1]
            otu_length = SeqLength.get(otu)
            countlen = otu_length * count
            totallength += countlen
            if otu in errorrate:
                otu_diffs = errorrate.get(otu)[1]
                totaldiffs = otu_diffs * count
                totalmismatches += totaldiffs
            else:
                totalmismatches += countlen
        e_rate = totalmismatches / float(totallength) * 100
        amptklib.log.info(args.mock_barcode + ' sample has ' +
                          '{0:,}'.format(len(mocks)) + ' OTUS out of ' +
                          '{0:,}'.format(mock_ref_count) + ' expected; ' +
                          '{0:,}'.format(variant_count) + ' mock variants; ' +
                          '{0:,}'.format(chimera_count) +
                          ' mock chimeras; Error rate: ' +
                          '{0:.3f}%'.format(e_rate))

    if not args.keep_mock:
        try:
            final.drop(args.mock_barcode, axis=1, inplace=True)
        except KeyError:  #mock sample absent or -b not given
            pass

    #drop OTUs that are now zeros through whole table
    final = final.loc[~(final == 0).all(axis=1)]
    final = final.astype(int)

    #output filtered normalized table
    if otuDict:
        final['Taxonomy'] = pd.Series(otuDict)
        final.to_csv(filtered_table, sep=delim)
        del final['Taxonomy']
    else:
        final.to_csv(filtered_table, sep=delim)

    #convert to binary
    final[final > 0] = 1

    #apply min_samples_otu here (the most stringent filter; only use it if you know what you are doing)
    los = final.sum(axis=1)
    fotus = los[los >= args.min_samples_otu]
    keep = fotus.index
    final2 = pd.DataFrame(final, index=keep)
    diff = len(final.index) - len(keep)
    if diff > 0:
        amptklib.log.info(
            'Dropped {:,} OTUs found in fewer than {:,} samples'.format(
                diff, args.min_samples_otu))

    #drop samples that don't have any OTUs after filtering
    final3 = final2.loc[:, (final2 != 0).any(axis=0)]
    final3 = final3.astype(int)

    #get the actual read counts from binary table
    merge = {}
    for sample, col in final3.items():  #items() yields (sample name, column Series)
        merge[sample] = []
        for i in range(0, len(col)):
            if col[i] == 0:
                merge[sample].append(col[i])
            else:
                merge[sample].append(SortedTable[sample][col.index[i]])

    FiltTable = pd.DataFrame(merge, index=list(final3.index))
    FiltTable.index.name = '#OTU ID'

    #order the filtered table
    #sort the table
    FiltTable2 = FiltTable.reindex(index=natsorted(FiltTable.index))
    if not args.col_order:
        FiltTable = FiltTable2.reindex(columns=natsorted(FiltTable2.columns))
    else:
        col_headers = args.col_order
        #keep only names present in the table; filter into a new list (removing while iterating skips entries)
        col_headers = [i for i in col_headers if i in FiltTable2.columns.values]
        FiltTable = FiltTable2.reindex(columns=col_headers)

    #check negative control samples for OTUs; if found, filter the OTUs and alert the user to rebuild the OTU table.
    #This could be done automatically, but it would require reads to be passed to this script, which seems excessive;
    #simply deleting the OTUs is probably not okay either.
    if args.negatives:
        if len(args.negatives) > 1:  #more than one value means a list of sample names
            Neg = args.negatives
        else:
            if os.path.isfile(args.negatives[0]):  #a single value may be a file of sample names
                Neg = []
                with open(args.negatives[0], 'r') as negfile:
                    for line in negfile:
                        line = line.replace('\n', '')
                        Neg.append(line)
            else:
                Neg = args.negatives
        #Now slice the final OTU table, check if values are valid
        NotFound = [i for i in Neg if i not in FiltTable.columns.values]
        #removing names while iterating over Neg would skip entries, so rebuild the list instead
        Neg = [i for i in Neg if i in FiltTable.columns.values]
        if len(NotFound) > 0:
            amptklib.log.info('Samples not found: %s' % ' '.join(NotFound))
        #slice table
        NegTable = FiltTable.reindex(columns=Neg)
        #drop those that are zeros through all samples, just pull out OTUs found in the negative samples
        NegTable = NegTable.loc[~(NegTable == 0).all(axis=1)]
        NegOTUs = list(NegTable.index)
        #now make sure you aren't dropping mock OTUs (if a mock was used), as those are needed for filtering the new OTU table
        mockOTUs = mock if args.mock_barcode else []
        NegOTUs = [item for item in NegOTUs if item not in mockOTUs]
    else:
        NegOTUs = []

    #check if negative OTUs exist, if so, then output updated OTUs and instructions on creating new OTU table
    if len(NegOTUs) > 0:
        amptklib.log.info("%i OTUs are potentially contamination" %
                          len(NegOTUs))
        otu_clean = base + '.cleaned.otus.fa'
        with open(otu_clean, 'w') as otu_update:
            with open(args.fasta, "rU") as myfasta:
                for rec in SeqIO.parse(myfasta, 'fasta'):
                    if not rec.id in NegOTUs:
                        SeqIO.write(rec, otu_update, 'fasta')
        amptklib.log.info("Cleaned OTUs saved to: %s" % otu_clean)
        amptklib.log.info(
            "Generate a new OTU table like so:\namptk remove -i %s --format fasta -l %s -o %s\nvsearch --usearch_global %s --db %s --strand plus --id 0.97 --otutabout newOTU.table.txt\n"
            % (base + '.demux.fq', ' '.join(Neg), base + '.cleaned.fa',
               base + '.cleaned.fa', otu_clean))

    else:  #proceed with rest of script
        #output final table
        if otuDict:
            FiltTable['Taxonomy'] = pd.Series(otuDict)
            FiltTable.to_csv(final_table, sep=delim)
            del FiltTable['Taxonomy']
        else:
            FiltTable.to_csv(final_table, sep=delim)
        finalSamples = FiltTable.columns.values.tolist()
        if 'Taxonomy' in finalSamples:
            numFinalSamples = len(finalSamples) - 1
        else:
            numFinalSamples = len(finalSamples)
        amptklib.log.info(
            'Filtered OTU table contains {:,} samples, {:,} OTUs, and {:,} read counts'
            .format(numFinalSamples, len(FiltTable.index),
                    FiltTable.values.sum()))
        if numFinalSamples < len(df.columns.values.tolist()):
            diffSamples = [
                item for item in headers
                if item not in FiltTable.columns.values.tolist()
            ]
            amptklib.log.info('Samples dropped: %s' % (','.join(diffSamples)))
        #output binary table
        if otuDict:
            final3['Taxonomy'] = pd.Series(otuDict)
            final3.to_csv(final_binary_table, sep=delim)
        else:
            final3.to_csv(final_binary_table, sep=delim)

        #generate final OTU list for taxonomy
        amptklib.log.info("Finding valid OTUs")
        otu_new = base + '.filtered.otus.fa'
        with open(otu_new, 'w') as otu_update:
            with open(args.fasta, "rU") as myfasta:
                for rec in SeqIO.parse(myfasta, 'fasta'):
                    if ';' in rec.id:
                        rec.id = rec.id.split(';', 1)[0]
                    if args.mock_barcode:
                        #map new names of mock
                        if rec.id in annotate_dict:
                            newname = annotate_dict.get(rec.id)
                            rec.id = newname
                            rec.description = ''
                    if rec.id in final3.index:
                        if rec.id in OTU_tax:
                            otu_update.write(
                                '>%s;%s\n%s\n' %
                                (rec.id, OTU_tax.get(rec.id), rec.seq))
                        else:
                            otu_update.write('>%s\n%s\n' % (rec.id, rec.seq))

        #tell user what output files are
        print("-------------------------------------------------------")
        print("OTU Table filtering finished")
        print("-------------------------------------------------------")
        print("OTU Table Stats:      %s" % stats_table)
        print("Sorted OTU table:     %s" % sorted_table)
        if not args.debug:
            for i in [
                    normal_table_pct, normal_table_nums, subtract_table,
                    mock_out, FastaCounts
            ]:
                amptklib.removefile(i)
        else:
            print("Normalized (pct):     %s" % normal_table_pct)
            print("Normalized (10k):     %s" % normal_table_nums)
            if args.subtract != 0:
                print("Subtracted table:     %s" % subtract_table)
        print("Normalized/filter:    %s" % filtered_table)
        print("Final Binary table:   %s" % final_binary_table)
        print("Final OTU table:      %s" % final_table)
        print("Filtered OTUs:        %s" % otu_new)
        print("-------------------------------------------------------")

        if 'darwin' in sys.platform:
            print(colr.WARN + "\nExample of next cmd:" + colr.END +
                  " amptk taxonomy -f %s -i %s -m mapping_file.txt -d ITS2\n" %
                  (otu_new, final_table))
        else:
            print(
                "\nExample of next cmd: amptk taxonomy -f %s -i %s -m mapping_file.txt -d ITS2\n"
                % (otu_new, final_table))
Example #2
def main(args):
    parser = argparse.ArgumentParser(
        prog='amptk-assign_taxonomy.py',
        usage="%(prog)s [options] -f <FASTA File>",
        description='''assign taxonomy to OTUs''',
        epilog="""Written by Jon Palmer (2015) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i',
                        '--otu_table',
                        dest="otu_table",
                        help='Append Taxonomy to OTU table')
    parser.add_argument('-f', '--fasta', required=True, help='FASTA input')
    parser.add_argument('-o', '--out', help='Output file (FASTA)')
    parser.add_argument(
        '-m',
        '--mapping_file',
        help='Mapping file: QIIME format can have extra meta data columns')
    parser.add_argument(
        '--method',
        default='hybrid',
        choices=['utax', 'usearch', 'sintax', 'hybrid', 'rdp', 'blast'],
        help='Taxonomy method')
    parser.add_argument(
        '-d',
        '--db',
        help='Pre-installed Databases: [ITS,ITS1,ITS2,16S,LSU,COI]')
    parser.add_argument(
        '-t',
        '--taxonomy',
        help='Incorporate taxonomy calculated elsewhere (two-column file)')
    parser.add_argument('--fasta_db',
                        help='Alternative database of fasta sequences')
    parser.add_argument('--add2db',
                        help='Custom FASTA database to add to DB on the fly')
    parser.add_argument('--utax_db', help='UTAX Reference Database')
    parser.add_argument('--utax_cutoff',
                        default=0.8,
                        type=restricted_float,
                        help='UTAX confidence value threshold.')
    parser.add_argument('--usearch_db', help='USEARCH Reference Database')
    parser.add_argument('--usearch_cutoff',
                        default=0.7,
                        type=restricted_float,
                        help='USEARCH percent ID threshold.')
    parser.add_argument(
        '-r',
        '--rdp',
        dest='rdp',
        default='/Users/jon/scripts/rdp_classifier_2.10.1/dist/classifier.jar',
        help='Path to RDP Classifier')
    parser.add_argument('--rdp_db',
                        dest='rdp_tax',
                        default='fungalits_unite',
                        choices=[
                            '16srrna', 'fungallsu', 'fungalits_warcup',
                            'fungalits_unite'
                        ],
                        help='Training set for RDP Classifier')
    parser.add_argument('--rdp_cutoff',
                        default=0.8,
                        type=restricted_float,
                        help='RDP confidence value threshold')
    parser.add_argument('--local_blast', help='Path to local Blast DB')
    parser.add_argument('-u',
                        '--usearch',
                        dest="usearch",
                        default='usearch9',
                        help='USEARCH9 EXE')
    parser.add_argument('--tax_filter',
                        help='Retain only OTUs whose taxonomy matches this string in the OTU table')
    parser.add_argument('--sintax_cutoff',
                        default=0.8,
                        type=restricted_float,
                        help='SINTAX threshold.')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Keep intermediate files')
    parser.add_argument('--cpus',
                        type=int,
                        help="Number of CPUs. Default: auto")
    args = parser.parse_args(args)

    parentdir = os.path.dirname(amptklib.__file__)

    if not args.out:
        #get base name of files
        if 'filtered' in args.fasta:
            base = args.fasta.split(".filtered")[0]
        elif 'otu' in args.fasta:
            base = args.fasta.split('.otu')[0]
        else:
            base = args.fasta.split('.fa')[0]
    else:
        base = args.out

    #remove logfile if exists
    log_name = base + '.amptk-taxonomy.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    amptklib.setupLogging(log_name)
    FNULL = open(os.devnull, 'w')
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")

    #initialize script, log system info and usearch version
    amptklib.SystemInfo()
    #Do a version check
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)

    #get number of cpus
    if args.cpus:
        cpus = args.cpus
    else:
        cpus = amptklib.getCPUS()

    #Setup DB locations and names, etc
    DBdir = os.path.join(parentdir, 'DB')
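    #each DataBase entry is a tuple of (usearch_global udb, UTAX udb, SINTAX udb)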
    DataBase = {
        'ITS1': (os.path.join(DBdir, 'ITS.udb'),
                 os.path.join(DBdir, 'ITS1_UTAX.udb'),
                 os.path.join(DBdir, 'ITS_SINTAX.udb')),
        'ITS2': (os.path.join(DBdir, 'ITS.udb'),
                 os.path.join(DBdir, 'ITS2_UTAX.udb'),
                 os.path.join(DBdir, 'ITS_SINTAX.udb')),
        'ITS': (os.path.join(DBdir, 'ITS.udb'),
                os.path.join(DBdir, 'ITS_UTAX.udb'),
                os.path.join(DBdir, 'ITS_SINTAX.udb')),
        '16S': (os.path.join(DBdir, '16S.udb'),
                os.path.join(DBdir, '16S.udb'),
                os.path.join(DBdir, '16S_SINTAX.udb')),
        'LSU': (os.path.join(DBdir, 'LSU.udb'),
                os.path.join(DBdir, 'LSU_UTAX.udb'),
                os.path.join(DBdir, 'LSU_SINTAX.udb')),
        'COI': (os.path.join(DBdir, 'COI.udb'),
                os.path.join(DBdir, 'COI_UTAX.udb'),
                os.path.join(DBdir, 'COI_SINTAX.udb'))
    }

    #get DB names up front
    if args.db in DataBase:
        utax_db = DataBase.get(args.db)[1]
        usearch_db = DataBase.get(args.db)[0]
        sintax_db = DataBase.get(args.db)[2]
        if not utax_db:
            utax_db = args.utax_db
        if not usearch_db:
            usearch_db = args.usearch_db
    else:
        utax_db = args.utax_db
        usearch_db = args.usearch_db
        if args.fasta_db:
            sintax_db = args.fasta_db
        else:
            sintax_db = args.usearch_db

    if args.method in ['hybrid', 'usearch', 'utax', 'sintax']:
        if not utax_db and not usearch_db and not args.fasta_db:
            amptklib.log.error(
                "You have not selected a database, need either --db, --utax_db, --usearch_db, or --fasta_db"
            )
            sys.exit(1)
        else:  #check that the DB exists
            if args.method == 'usearch' and usearch_db:
                if not amptklib.checkfile(usearch_db):
                    amptklib.log.error(
                        'USEARCH DB not found: {:}'.format(usearch_db))
                    amptklib.log.error(
                        'Use `amptk install` to install pre-formatted databases or `amptk database` to create custom DB'
                    )
                    sys.exit(1)
            if args.method == 'sintax' and sintax_db:
                if not amptklib.checkfile(sintax_db):
                    amptklib.log.error(
                        'SINTAX DB not found: {:}'.format(sintax_db))
                    amptklib.log.error(
                        'Use `amptk install` to install pre-formatted databases or `amptk database` to create custom DB'
                    )
                    sys.exit(1)
            if args.method == 'utax' and utax_db:
                if not amptklib.checkfile(utax_db):
                    amptklib.log.error(
                        'UTAX DB not found: {:}'.format(utax_db))
                    amptklib.log.error(
                        'Use `amptk install` to install pre-formatted databases or `amptk database` to create custom DB'
                    )
                    sys.exit(1)

    custom_db = None
    if args.add2db:  #user wants to add sequences to the database on the fly, so the database must be rebuilt
        custom_db = base + '.custom_database.fa'
        if amptklib.checkfile(custom_db):
            amptklib.SafeRemove(custom_db)
        if args.db:  #this means that the fasta files need to be extracted
            amptklib.log.info("Adding {:} to the {:} database".format(
                os.path.basename(args.add2db), os.path.basename(usearch_db)))
            cmd = ['vsearch', '--udb2fasta', usearch_db, '--output', custom_db]
            amptklib.runSubprocess(cmd, amptklib.log)
            with open(custom_db, 'a') as outfile:
                with open(args.add2db, 'r') as infile:
                    shutil.copyfileobj(infile, outfile)
        elif args.fasta_db:
            amptklib.log.info("Adding {:} to the {:} database".format(
                os.path.basename(args.add2db),
                os.path.basename(args.fasta_db)))
            with open(custom_db, 'w') as outfile:
                with open(args.fasta_db, 'r') as infile:
                    shutil.copyfileobj(infile, outfile)
                with open(args.add2db, 'r') as infile:
                    shutil.copyfileobj(infile, outfile)

    #Count records
    amptklib.log.info("Loading FASTA Records")
    total = amptklib.countfasta(args.fasta)
    amptklib.log.info('{0:,}'.format(total) + ' OTUs')

    #declare output files/variables here
    blast_out = base + '.blast.txt'
    rdp_out = base + '.rdp.txt'
    utax_out = base + '.utax.txt'
    usearch_out = base + '.usearch.txt'
    sintax_out = base + '.sintax.txt'
    otuDict = {}

    if not args.taxonomy:
        #start with less common uses, i.e. Blast, rdp
        if args.method == 'blast':
            #check if command line blast installed
            if not amptklib.which('blastn'):
                amptklib.log.error("BLASTN not found in your PATH, exiting.")
                sys.exit(1)

            #run BLAST either locally (--local_blast) or remotely against the NCBI nt database
            outformat = "6 qseqid sseqid pident stitle"
            if args.local_blast:
                #get number of cpus
                amptklib.log.info("Running local BLAST using db: %s" %
                                  args.local_blast)
                cmd = [
                    'blastn', '-num_threads',
                    str(cpus), '-query', args.fasta, '-db',
                    os.path.abspath(args.local_blast), '-max_target_seqs', '1',
                    '-outfmt', outformat, '-out', blast_out
                ]
                amptklib.runSubprocess(cmd, amptklib.log)
            else:
                amptklib.log.info(
                    "Running BLASTN using NCBI remote nt database, this may take awhile"
                )
                cmd = [
                    'blastn', '-query', args.fasta, '-db', 'nt', '-remote',
                    '-max_target_seqs', '1', '-outfmt', outformat, '-out',
                    blast_out
                ]
                amptklib.runSubprocess(cmd, amptklib.log)

            #load results and reformat
            new = []
            f = csv.reader(open(blast_out), delimiter=str('\t'))
            for col in f:
                query = col[0]
                gbID = col[1].split("|")[3]
                pident = col[2]
                name = col[3]
                tax = gbID + ";" + name + " (" + pident + ")"
                line = [query, tax]
                new.append(line)
            otuDict = dict(new)
        elif args.method == 'rdp':
            #check that classifier is installed
            try:
                rdp_test = subprocess.Popen(
                    ['java', '-Xmx2000m', '-jar', args.rdp, 'classify'],
                    stdout=subprocess.PIPE).communicate()[0].rstrip()
            except OSError:
                amptklib.log.error("%s not found in your PATH, exiting." %
                                   args.rdp)
                sys.exit(1)

            #RDP database
            amptklib.log.info("Using RDP classifier %s training set" %
                              args.rdp_tax)

            #run RDP
            cmd = [
                'java', '-Xmx2000m', '-jar', args.rdp, 'classify', '-g',
                args.rdp_tax, '-o', rdp_out, '-f', 'fixrank', args.fasta
            ]
            amptklib.runSubprocess(cmd, amptklib.log)

            #load in results and put into dictionary
            new = []
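            #rank names matching these patterns are stripped from the lineage as uninformative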
            removal = ["unidentified", "Incertae", "uncultured", "incertae"]
            remove_exp = [re.compile(x) for x in removal]
            f = csv.reader(open(rdp_out), delimiter=str('\t'))
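            #fixrank output: rank names in cols 2,5,8,11,14,17; confidence scores in cols 4,7,10,13,16,19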
            for col in f:
                if float(col[19]) > args.rdp_cutoff:
                    tax = ("RDP;k:" + col[2] + ",p:" + col[5] + ",c:" + col[8] +
                           ",o:" + col[11] + ",f:" + col[14] + ",g:" + col[17])
                elif float(col[16]) > args.rdp_cutoff:
                    tax = ("RDP;k:" + col[2] + ",p:" + col[5] + ",c:" + col[8] +
                           ",o:" + col[11] + ",f:" + col[14])
                elif float(col[13]) > args.rdp_cutoff:
                    tax = ("RDP;k:" + col[2] + ",p:" + col[5] + ",c:" + col[8] +
                           ",o:" + col[11])
                elif float(col[10]) > args.rdp_cutoff:
                    tax = "RDP;k:" + col[2] + ",p:" + col[5] + ",c:" + col[8]
                elif float(col[7]) > args.rdp_cutoff:
                    tax = "RDP;k:" + col[2] + ",p:" + col[5]
                elif float(col[4]) > args.rdp_cutoff:
                    tax = "RDP;k:" + col[2]
                else:
                    tax = "RDP;k:unclassified"
                tax_split = tax.split(",")
                tax = [
                    s for s in tax_split
                    if not any(rx.search(s) for rx in remove_exp)
                ]
                tax = ",".join(tax)
                line = [col[0], tax]
                new.append(line)
            otuDict = dict(new)
        else:
            #check status of USEARCH DB and run
            if args.method in ['hybrid', 'usearch']:
                if args.fasta_db:
                    #now run through usearch global
                    amptklib.log.info(
                        "Global alignment OTUs with usearch_global (VSEARCH) against {:}"
                        .format(os.path.basename(args.fasta_db)))
                    cmd = [
                        'vsearch', '--usearch_global', args.fasta, '--db',
                        os.path.abspath(args.fasta_db), '--userout',
                        usearch_out, '--id',
                        str(args.usearch_cutoff), '--strand', 'both',
                        '--output_no_hits', '--maxaccepts', '0',
                        '--top_hits_only', '--userfields', 'query+target+id',
                        '--notrunclabels', '--threads',
                        str(cpus)
                    ]
                    amptklib.runSubprocess(cmd, amptklib.log)
                elif custom_db:
                    #now run through usearch global
                    amptklib.log.info(
                        "Global alignment OTUs with usearch_global (VSEARCH) against custom DB"
                    )
                    cmd = [
                        'vsearch', '--usearch_global', args.fasta, '--db',
                        os.path.abspath(custom_db), '--userout', usearch_out,
                        '--id',
                        str(args.usearch_cutoff), '--strand', 'both',
                        '--output_no_hits', '--maxaccepts', '0',
                        '--top_hits_only', '--userfields', 'query+target+id',
                        '--notrunclabels', '--threads',
                        str(cpus)
                    ]
                    amptklib.runSubprocess(cmd, amptklib.log)
                else:
                    if usearch_db:
                        amptklib.log.info(
                            "Global alignment OTUs with usearch_global (VSEARCH) against {:}"
                            .format(os.path.basename(usearch_db)))
                        cmd = [
                            'vsearch', '--usearch_global', args.fasta, '--db',
                            os.path.abspath(usearch_db), '--userout',
                            usearch_out, '--id',
                            str(args.usearch_cutoff), '--strand', 'both',
                            '--output_no_hits', '--maxaccepts', '0',
                            '--top_hits_only', '--userfields',
                            'query+target+id', '--notrunclabels', '--threads',
                            str(cpus)
                        ]
                        amptklib.runSubprocess(cmd, amptklib.log)

            if args.method in ['hybrid', 'utax']:
                if utax_db:
                    #now run through UTAX
                    utax_out = base + '.utax.txt'
                    amptklib.log.info("Classifying OTUs with UTAX (USEARCH)")
                    cutoff = str(args.utax_cutoff)
                    cmd = [
                        usearch, '-utax', args.fasta, '-db', utax_db,
                        '-utaxout', utax_out, '-utax_cutoff', cutoff,
                        '-strand', 'plus', '-notrunclabels', '-threads',
                        str(cpus)
                    ]
                    amptklib.runSubprocess(cmd, amptklib.log)
                else:
                    amptklib.log.error("UTAX DB %s not found, skipping" %
                                       utax_db)

            if args.method in ['hybrid', 'sintax']:
                if args.fasta_db:  #if you pass fasta file here, over ride any auto detection
                    sintax_db = args.fasta_db
                #now run sintax
                amptklib.log.info("Classifying OTUs with SINTAX (USEARCH)")
                cmd = [
                    usearch, '-sintax', args.fasta, '-db',
                    os.path.abspath(sintax_db), '-tabbedout', sintax_out,
                    '-sintax_cutoff',
                    str(args.sintax_cutoff), '-strand', 'both', '-threads',
                    str(cpus)
                ]
                amptklib.runSubprocess(cmd, amptklib.log)

            #now process results, load into dictionary - slightly different depending on which classification was run.
            if args.method == 'hybrid':
                #run upgraded method: first load dictionaries with results
                if amptklib.checkfile(utax_out):
                    utaxDict = amptklib.classifier2dict(
                        utax_out, args.utax_cutoff)
                    amptklib.log.debug(
                        'UTAX results parsed, resulting in {:,} taxonomy predictions'
                        .format(len(utaxDict)))
                else:
                    amptklib.log.info('UTAX results empty')
                    utaxDict = {}
                if amptklib.checkfile(sintax_out):
                    sintaxDict = amptklib.classifier2dict(
                        sintax_out, args.sintax_cutoff)
                    amptklib.log.debug(
                        'SINTAX results parsed, resulting in {:,} taxonomy predictions'
                        .format(len(sintaxDict)))
                else:
                    amptklib.log.info('SINTAX results empty')
                    sintaxDict = {}
                usearchDict = amptklib.usearchglobal2dict(usearch_out)
                amptklib.log.debug(
                    'Global alignment results parsed, resulting in {:,} taxonomy predictions'
                    .format(len(usearchDict)))
                otuList = natsorted(list(usearchDict.keys()))
                #first compare classifier results, getting better of the two
                bestClassify = amptklib.bestclassifier(utaxDict, sintaxDict,
                                                       otuList)
                #now get best taxonomy by comparing to global alignment results
                otuDict = amptklib.bestTaxonomy(usearchDict, bestClassify)
                amptklib.log.debug(
                    'Combined OTU taxonomy dictionary contains {:,} taxonomy predictions'
                    .format(len(otuDict)))
                if len(otuDict) < 1:
                    amptklib.log.error('Parsing taxonomy failed -- see logfile')
                    sys.exit(1)

            elif args.method == 'utax' and amptklib.checkfile(utax_out):
                #load results into dictionary for appending to OTU table
                amptklib.log.debug("Loading UTAX results into dictionary")
                with open(utax_out, 'r') as infile:
                    reader = csv.reader(infile, delimiter=str("\t"))
                    otuDict = {rows[0]: 'UTAX;' + rows[2] for rows in reader}

            elif args.method == 'usearch' and amptklib.checkfile(usearch_out):
                #load results into dictionary for appending to OTU table
                amptklib.log.debug(
                    "Loading Global Alignment results into dictionary")
                otuDict = {}
                usearchDict = amptklib.usearchglobal2dict(usearch_out)
                for k, v in natsorted(list(usearchDict.items())):
                    pident = float(v[0]) * 100
                    pident = "{0:.1f}".format(pident)
                    ID = v[1]
                    tax = ','.join(v[-1])
                    LCA = v[2]
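                    #prefix GS marks a plain top hit; GSL marks taxonomy reduced to an LCA of tied hits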
                    if LCA == '':
                        fulltax = 'GS|' + pident + '|' + ID + ';' + tax
                    else:
                        fulltax = 'GSL|' + pident + '|' + ID + ';' + tax
                    otuDict[k] = fulltax

            elif args.method == 'sintax' and amptklib.checkfile(sintax_out):
                #load results into dictionary for appending to OTU table
                amptklib.log.debug("Loading SINTAX results into dictionary")
                with open(sintax_out, 'r') as infile:
                    reader = csv.reader(infile, delimiter=str("\t"))
                    otuDict = {rows[0]: 'SINTAX;' + rows[3] for rows in reader}
    else:
        #you have supplied a two column taxonomy file, parse and build otuDict
        amptklib.log.debug("Loading custom Taxonomy into dictionary")
        with open(args.taxonomy, 'r') as infile:
            reader = csv.reader(infile, delimiter=str("\t"))
            otuDict = {rows[0]: rows[1] for rows in reader}

    #now format results
    if args.otu_table:
        #check if otu_table variable is empty, then load in otu table
        amptklib.log.info("Appending taxonomy to OTU table and OTUs")
        taxTable = base + '.otu_table.taxonomy.txt'
        tmpTable = base + '.otu_table.tmp'

        #append to OTU table
        counts = 0
        with open(taxTable, 'w') as outTable:
            with open(args.otu_table, 'r') as inTable:
                #guess the delimiter format
                firstline = inTable.readline()
                dialect = amptklib.guess_csv_dialect(firstline)
                inTable.seek(0)
                #parse OTU table
                reader = csv.reader(inTable, dialect)
                for line in reader:
                    if line[0].startswith(("#OTU", "OTUId")):
                        line.append('Taxonomy')
                    else:
                        tax = otuDict.get(line[0]) or "No Hit"
                        line.append(tax)
                    if args.tax_filter and args.method != 'blast':
                        if line[0].startswith(("#OTU", "OTUId")):
                            join_line = ('\t'.join(str(x) for x in line))
                        else:
                            if args.tax_filter in line[-1]:
                                join_line = ('\t'.join(str(x) for x in line))
                                counts += 1
                            else:
                                continue
                    else:
                        join_line = ('\t'.join(str(x) for x in line))
                        counts += 1
                    outTable.write("%s\n" % join_line)

        if args.tax_filter:
            if args.method == 'blast':
                amptklib.log.info(
                    "Blast is incompatible with --tax_filter, use a different method"
                )
                tmpTable = args.otu_table
            else:
                nonfungal = total - counts
                amptklib.log.info(
                    "Found %i OTUs not matching %s, writing %i %s hits to taxonomy OTU table"
                    % (nonfungal, args.tax_filter, counts, args.tax_filter))
                #need to create a filtered table without taxonomy for BIOM output
                with open(tmpTable, 'w') as output:
                    with open(taxTable, 'r') as input:
                        firstline = input.readline()
                        dialect = amptklib.guess_csv_dialect(firstline)
                        input.seek(0)
                        #parse OTU table
                        reader = csv.reader(input, dialect)
                        for line in reader:
                            del line[-1]
                            join_line = '\t'.join(str(x) for x in line)
                            output.write("%s\n" % join_line)
        else:
            tmpTable = args.otu_table

    #append to OTUs
    otuTax = base + '.otus.taxonomy.fa'
    with open(otuTax, 'w') as output:
        with open(args.fasta, 'r') as input:
            SeqRecords = SeqIO.parse(input, 'fasta')
            for rec in SeqRecords:
                tax = otuDict.get(rec.id) or "No hit"
                rec.description = tax
                SeqIO.write(rec, output, 'fasta')

    if not args.taxonomy:
        #output final taxonomy in two-column format, followed by the hits for usearch/sintax/utax if hybrid is used.
        taxFinal = base + '.taxonomy.txt'
        with open(taxFinal, 'w') as finaltax:
            if args.method == 'hybrid':
                finaltax.write('#OTUID\ttaxonomy\tUSEARCH\tSINTAX\tUTAX\n')
                for k, v in natsorted(list(otuDict.items())):
                    if k in usearchDict:
                        usearchResult = usearchDict.get(k)
                        usearchResult = ','.join(usearchResult[-1])
                    else:
                        usearchResult = 'No hit'
                    if k in sintaxDict:
                        sintaxResult = sintaxDict.get(k)
                        sintaxResult = ','.join(sintaxResult[-1])
                    else:
                        sintaxResult = 'No hit'
                    if k in utaxDict:
                        utaxResult = utaxDict.get(k)
                        utaxResult = ','.join(utaxResult[-1])
                    else:
                        utaxResult = 'No hit'
                    finaltax.write('{:}\t{:}\t{:}\t{:}\t{:}\n'.format(
                        k, v, usearchResult, sintaxResult, utaxResult))
            else:
                finaltax.write('#OTUID\ttaxonomy\n')
                for k, v in natsorted(list(otuDict.items())):
                    finaltax.write('%s\t%s\n' % (k, v))
    else:
        taxFinal = args.taxonomy
    #convert taxonomy to qiime format for biom
    qiimeTax = None
    if not args.method == 'blast':
        qiimeTax = base + '.qiime.taxonomy.txt'
        amptklib.utax2qiime(taxFinal, qiimeTax)
    else:
        amptklib.log.error(
            "Blast taxonomy is not compatible with BIOM output, use a different method"
        )

    #create OTU phylogeny for downstream processes
    amptklib.log.info("Generating phylogenetic tree")
    tree_out = base + '.tree.phy'
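    #-cluster_agg builds an agglomerative clustering tree from the OTU sequences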
    cmd = [usearch, '-cluster_agg', args.fasta, '-treeout', tree_out]
    amptklib.runSubprocess(cmd, amptklib.log)

    #print some summary file locations
    amptklib.log.info("Taxonomy finished: %s" % taxFinal)
    if args.otu_table and not args.method == 'blast':
        amptklib.log.info("Classic OTU table with taxonomy: %s" % taxTable)
        #output final OTU table in Biom v1.0 (i.e. json format if biom installed)
        outBiom = base + '.biom'
        if amptklib.which('biom'):
            amptklib.removefile(outBiom)
            cmd = [
                'biom', 'convert', '-i', tmpTable, '-o', outBiom + '.tmp',
                '--table-type', "OTU table", '--to-json'
            ]
            amptklib.runSubprocess(cmd, amptklib.log)
            if args.mapping_file:
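                #sanity-check metadata: every sample in the OTU table must appear exactly once in the mapping file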
                mapSamples = []
                repeatSamples = []
                with open(args.mapping_file, 'r') as mapin:
                    for line in mapin:
                        line = line.rstrip()
                        if line.startswith('#'):
                            continue
                        sampleID = line.split('\t')[0]
                        if not sampleID in mapSamples:
                            mapSamples.append(sampleID)
                        else:
                            repeatSamples.append(sampleID)
                otuSamples = []
                with open(tmpTable, 'r') as otuin:
                    for line in otuin:
                        line = line.rstrip()
                        if line.startswith('#'):
                            otuSamples = line.split('\t')[1:]
                missingMap = []
                for otu in otuSamples:
                    if not otu in mapSamples:
                        missingMap.append(otu)
                if len(missingMap) > 0:
                    amptklib.log.error(
                        "%s are missing from mapping file (metadata), skipping biom file creation"
                        % ', '.join(missingMap))
                elif len(repeatSamples) > 0:
                    amptklib.log.error(
                        '%s duplicate sample IDs in mapping file, skipping biom file creation'
                        % ', '.join(repeatSamples))
                else:
                    if qiimeTax:
                        cmd = [
                            'biom', 'add-metadata', '-i', outBiom + '.tmp',
                            '-o', outBiom, '--observation-metadata-fp',
                            qiimeTax, '-m', args.mapping_file,
                            '--sc-separated', 'taxonomy', '--output-as-json'
                        ]
                    else:
                        cmd = [
                            'biom', 'add-metadata', '-i', outBiom + '.tmp',
                            '-o', outBiom, '-m', args.mapping_file,
                            '--output-as-json'
                        ]
                    amptklib.runSubprocess(cmd, amptklib.log)
            else:
                cmd = [
                    'biom', 'add-metadata', '-i', outBiom + '.tmp', '-o',
                    outBiom, '--observation-metadata-fp', qiimeTax,
                    '--sc-separated', 'taxonomy', '--output-as-json'
                ]
                amptklib.runSubprocess(cmd, amptklib.log)
            amptklib.removefile(outBiom + '.tmp')
            amptklib.log.info("BIOM OTU table created: %s" % outBiom)
        else:
            amptklib.log.info(
                "biom program not installed, install via `pip install biom-format` or `conda install biom-format`"
            )
    amptklib.log.info("OTUs with taxonomy: %s" % otuTax)
    amptklib.log.info("OTU phylogeny: %s" % tree_out)

    #clean up intermediate files
    if not args.debug:
        for i in [
                utax_out, usearch_out, sintax_out, qiimeTax,
                base + '.otu_table.tmp'
        ]:
            if i:
                amptklib.removefile(i)
    print("-------------------------------------------------------")
Example #3
def main(args):
    parser = argparse.ArgumentParser(
        prog='amptk-OTU_cluster_ref.py',
        usage="%(prog)s [options] -i file.demux.fq\n%(prog)s -h for help menu",
        description='''Script runs UPARSE OTU clustering.
		Requires USEARCH by Robert C. Edgar: http://drive5.com/usearch''',
        epilog="""Written by Jon Palmer (2016) [email protected]""",
        formatter_class=MyFormatter)

    parser.add_argument('-i',
                        '--fastq',
                        dest="FASTQ",
                        required=True,
                        help='FASTQ file (Required)')
    parser.add_argument('-o', '--out', help='Base output name')
    parser.add_argument('-e',
                        '--maxee',
                        default='1.0',
                        help='Quality trim EE value')
    parser.add_argument('-p',
                        '--pct_otu',
                        default='97',
                        help="OTU Clustering Percent")
    parser.add_argument('--id', default='97', help="Threshold for alignment")
    parser.add_argument('-m',
                        '--minsize',
                        default='2',
                        help='Min identical seqs to process')
    parser.add_argument('-u',
                        '--usearch',
                        dest="usearch",
                        default='usearch9',
                        help='USEARCH9 EXE')
    parser.add_argument('--map_filtered',
                        action='store_true',
                        help='map quality filtered reads back to OTUs')
    parser.add_argument(
        '-d',
        '--db',
        required=True,
        help='Reference Database [ITS,ITS1,ITS2,16S,LSU,COI,custom]')
    parser.add_argument('--utax_db', help='UTAX Reference Database')
    parser.add_argument('--utax_cutoff',
                        default=0.8,
                        type=restricted_float,
                        help='UTAX confidence value threshold.')
    parser.add_argument('--utax_level',
                        default='k',
                        choices=['k', 'p', 'c', 'o', 'f', 'g', 's'],
                        help='UTAX classification level to retain')
    parser.add_argument('--mock',
                        default='synmock',
                        help='Spike-in mock community (fasta)')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Keep intermediate files')
    parser.add_argument('--closed_ref_only',
                        action='store_true',
                        help='Only run closed reference clustering')
    parser.add_argument('--cpus',
                        type=int,
                        help="Number of CPUs. Default: auto")
    args = parser.parse_args(args)

    parentdir = os.path.join(os.path.dirname(amptklib.__file__))

    #get basename if not args.out passed
    if args.out:
        base = args.out
    else:
        if 'demux' in args.FASTQ:
            base = os.path.basename(args.FASTQ).split('.demux')[0]
        else:
            base = os.path.basename(args.FASTQ).split('.f')[0]

    taxonomyLookup = {
        'k': 'Kingdom',
        'p': 'Phylum',
        'c': 'Class',
        'o': 'Order',
        'f': 'Family',
        'g': 'Genus',
        's': 'Species'
    }

    #remove logfile if exists
    log_name = base + '.amptk-cluster_ref.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    amptklib.setupLogging(log_name)
    FNULL = open(os.devnull, 'w')
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")

    #initialize script, log system info and usearch version
    amptklib.SystemInfo()
    #Do a version check
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)

    #get number of cpus
    if args.cpus:
        cpus = args.cpus
    else:
        cpus = amptklib.getCPUS()

    #make tmp folder
    tmp = base + '_tmp'
    if not os.path.exists(tmp):
        os.makedirs(tmp)

    #Setup DB locations and names, etc
    DBdir = os.path.join(parentdir, 'DB')
    DataBase = {
        'ITS1':
        (os.path.join(DBdir, 'ITS.udb'), os.path.join(DBdir, 'ITS1_UTAX.udb')),
        'ITS2':
        (os.path.join(DBdir, 'ITS.udb'), os.path.join(DBdir, 'ITS2_UTAX.udb')),
        'ITS': (os.path.join(DBdir,
                             'ITS.udb'), os.path.join(DBdir, 'ITS_UTAX.udb')),
        '16S': (os.path.join(DBdir, '16S.udb'), os.path.join(DBdir,
                                                             '16S.udb')),
        'LSU': (os.path.join(DBdir,
                             'LSU.udb'), os.path.join(DBdir, 'LSU_UTAX.udb')),
        'COI': (os.path.join(DBdir,
                             'COI.udb'), os.path.join(DBdir, 'COI_UTAX.udb'))
    }

    #setup refDB
    amptklib.log.info("Checking Reference Database")
    if args.db in DataBase:
        #need to write to fasta from vsearch UDB
        DB = os.path.join(tmp, args.db + '.extracted.fa')
        cmd = [
            'vsearch', '--udb2fasta',
            DataBase.get(args.db)[0], '--output', DB
        ]
        amptklib.runSubprocess(cmd, amptklib.log)
    else:
        DB = os.path.abspath(args.db)
    refDB = os.path.join(tmp, 'reference_DB.fa')
    if args.mock:
        if args.mock == 'synmock':
            mock = os.path.join(parentdir, 'DB', 'amptk_synmock.fa')
        else:
            mock = os.path.abspath(args.mock)
    #merge mock (if any) and reference sequences, failing on duplicate IDs
    seen = set()
    with open(refDB, 'w') as output:
        if args.mock:
            with open(mock) as input1:
                for rec in SeqIO.parse(input1, 'fasta'):
                    if rec.id not in seen:
                        seen.add(rec.id)
                        SeqIO.write(rec, output, 'fasta')
                    else:
                        amptklib.log.error(
                            "Duplicate IDs in Ref DB: %s, exiting" % rec.id)
                        sys.exit(1)
        with open(DB) as input2:
            for rec in SeqIO.parse(input2, 'fasta'):
                if rec.id not in seen:
                    seen.add(rec.id)
                    SeqIO.write(rec, output, 'fasta')
                else:
                    amptklib.log.error(
                        "Duplicate IDs in Ref DB: %s, exiting" % rec.id)
                    sys.exit(1)

    #get utax_database
    if args.db in DataBase:
        utaxDB = DataBase.get(args.db)[1]
    else:
        if not args.closed_ref_only:
            if args.utax_db:
                utaxDB = os.path.abspath(args.utax_db)
            else:
                amptklib.log.error(
                    "%s is not a pre-installed DB, so you must also specify a valid UTAX database via --utax_db"
                    % args.db)
                sys.exit(1)

    #Count FASTQ records
    amptklib.log.info("Loading FASTQ Records")
    #convert to FASTA for mapping
    orig_fasta = os.path.join(tmp, base + '.orig.fa')
    cmd = [
        'vsearch', '--fastq_filter', args.FASTQ, '--fastaout', orig_fasta,
        '--fastq_qmax', '55', '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    orig_total = amptklib.countfasta(orig_fasta)
    size = amptklib.checkfastqsize(args.FASTQ)
    readablesize = amptklib.convertSize(size)
    amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize +
                      ')')

    #Expected Errors filtering step
    filter_out = os.path.join(tmp, base + '.EE' + args.maxee + '.filter.fq')
    filter_fasta = os.path.join(tmp, base + '.EE' + args.maxee + '.filter.fa')
    amptklib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
    cmd = [
        'vsearch', '--fastq_filter', args.FASTQ, '--fastq_maxee',
        str(args.maxee), '--fastqout', filter_out, '--fastaout', filter_fasta,
        '--fastq_qmax', '55', '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    qtrimtotal = amptklib.countfastq(filter_out)
    amptklib.log.info('{0:,}'.format(qtrimtotal) + ' reads passed')

    #now run full length dereplication
    derep_out = os.path.join(tmp, base + '.EE' + args.maxee + '.derep.fa')
    amptklib.log.info("De-replication (remove duplicate reads)")
    cmd = [
        'vsearch', '--derep_fulllength', filter_fasta, '--sizeout', '--output',
        derep_out, '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(derep_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')

    #now run sort by size
    sort_out = os.path.join(tmp, base + '.EE' + args.maxee + '.sort.fa')
    amptklib.log.info(
        "Sorting reads by size: removing reads seen fewer than %s times" %
        args.minsize)
    cmd = [
        'vsearch', '--sortbysize', derep_out, '--minsize', args.minsize,
        '--output', sort_out, '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(sort_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')

    #chimera detection
    #first run through de novo chimera detection
    amptklib.log.info("De novo chimera detection (VSEARCH)")
    chimera_out = os.path.join(tmp,
                               base + '.EE' + args.maxee + '.chimera_check.fa')
    cmd = [
        'vsearch', '--uchime_denovo', sort_out, '--relabel', 'Seq',
        '--sizeout', '--nonchimeras', chimera_out, '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(chimera_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')

    #now run uchime_ref
    uchime_out = os.path.join(tmp,
                              base + '.EE' + args.maxee + '.uchime.otus.fa')
    #now run chimera filtering if all checks out
    amptklib.log.info("Chimera Filtering (VSEARCH)")
    cmd = [
        'vsearch', '--mindiv', '1.0', '--uchime_ref', chimera_out, '--db',
        refDB, '--sizeout', '--nonchimeras', uchime_out, '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(uchime_out)
    amptklib.log.info('{0:,}'.format(total) + ' OTUs passed')

    #now run usearch_global versus reference database
    align_out = os.path.join(tmp, base + '.align.uc')
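    #vsearch --id takes a fraction, so convert the percent identity (97 -> 0.97)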
    pident = int(args.id) * 0.01
    amptklib.log.info(
        "Reference Clustering using Global Alignment, %s%% identity" % args.id)
    cmd = [
        'vsearch', '--usearch_global', uchime_out, '--db', refDB, '--id',
        str(pident), '--output_no_hits', '--top_hits_only', '--notrunclabels',
        '--uc', align_out, '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)

    #parse results
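    #UC format: col[3] = percent identity ('*' for no hit), col[8] = query label, col[9] = target label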
    ref_results = {}
    nohits = []
    with open(align_out, 'r') as alignment:
        for line in alignment:
            line = line.replace('\n', '')
            col = line.split('\t')
            counts = col[8].split(';')
            counts = int(counts[1].replace('size=', ''))
            if col[3] == '*':
                nohits.append(col[8])
                continue
            if float(col[3]) >= float(args.id):
                if not col[8] in ref_results:
                    ref_results[col[8]] = (col[9], col[3], counts)
                else:
                    print("Error: %s duplicated ID" % col[8])
            else:
                nohits.append(col[8])

    #summarize results from first ref clustering
    num_refcluster = len(ref_results)
    seqs_refcluster = 0
    for k, v in list(ref_results.items()):
        seqs_refcluster += v[2]
    amptklib.log.info("%i OTUs classified " % num_refcluster +
                      "({0:.0f}%".format(seqs_refcluster / float(qtrimtotal) *
                                         100) + " of reads)")

    #get ref clustered hits to file with taxonomy
    ref_clustered = os.path.join(tmp, base + '.ref_clustered.fa')
    with open(ref_clustered, 'w') as refoutput:
        with open(uchime_out, 'r') as input:
            otu_counter = 1
            for rec in SeqIO.parse(input, 'fasta'):
                if rec.id in ref_results:
                    res = ref_results.get(rec.id)
                    pident = res[1]
                    tax = res[0]
                    newID = 'OTU' + str(
                        otu_counter) + ';pident=' + pident + ';' + tax
                    rec.id = newID
                    rec.name = ''
                    rec.description = ''
                    SeqIO.write(rec, refoutput, 'fasta')
                    otu_counter += 1

    if not args.closed_ref_only:
        #get nohits file to run clustering
        utax_ref = os.path.join(tmp,
                                base + '.EE' + args.maxee + '.utax_ref.fa')
        with open(utax_ref, 'w') as output:
            with open(uchime_out, 'r') as input:
                for rec in SeqIO.parse(input, 'fasta'):
                    if rec.id in nohits:
                        SeqIO.write(rec, output, 'fasta')

        #input to UPARSE must be sorted by size, so sort the unmatched reads
        ref_sort = os.path.join(tmp, base + '.utax_ref.sorted.fa')
        cmd = [
            'vsearch', '--sortbysize', utax_ref, '--minsize', args.minsize,
            '--output', ref_sort, '--threads',
            str(cpus)
        ]
        amptklib.runSubprocess(cmd, amptklib.log)

        #now run clustering algorithm on those not found in reference database
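        #UPARSE uses a cluster radius (percent divergence), so 97% identity equals a radius of 3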
        radius = str(100 - int(args.pct_otu))
        otu_out = os.path.join(tmp, base + '.EE' + args.maxee + '.otus.fa')
        amptklib.log.info("De novo Clustering remaining sequences (UPARSE)")
        cmd = [
            usearch, '-cluster_otus', ref_sort, '-relabel', 'OTU',
            '-otu_radius_pct', radius, '-otus', otu_out
        ]
        amptklib.runSubprocess(cmd, amptklib.log)
        total = amptklib.countfasta(otu_out)
        amptklib.log.info('{0:,}'.format(total) + ' de novo OTUs')

        #try utax reference clustering
        amptklib.log.info("Reference Clustering de novo OTUs using UTAX")
        cmd = [
            usearch, '-cluster_otus_utax', otu_out, '-db', utaxDB,
            '-utax_cutoff',
            str(args.utax_cutoff), '-utax_level', 's', '-strand', 'plus',
            '-utaxout',
            os.path.join(tmp, base + '.utax.out')
        ]
        amptklib.runSubprocess(cmd, amptklib.log)
        #setup tax filtering
        tax_values = ['k', 'p', 'c', 'o', 'f', 'g', 's']
        filter_index = tax_values.index(args.utax_level)
        filt_tax_values = [s + ':' for s in tax_values[filter_index:]]
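        #e.g. --utax_level g yields ['g:', 's:']; only OTUs classified to at least that level are kept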
        #get results from utax
        with open(ref_clustered, 'a') as output:
            seqDict = SeqIO.index(otu_out, 'fasta')
            utaxresults = []
            with open(os.path.join(tmp, base + '.utax.out'), 'r') as utax:
                for line in utax:
                    line = line.replace('\n', '')
                    col = line.split('\t')
                    ID = col[0]
                    tax = col[2]
                    if any(x in tax for x in filt_tax_values):
                        record = seqDict[ID]
                        record.id = 'OTU' + str(
                            otu_counter) + ';UTAX;tax=' + tax
                        record.name = ''
                        record.description = ''
                        SeqIO.write(record, output, 'fasta')
                        otu_counter += 1
        total = amptklib.countfasta(ref_clustered) - num_refcluster
        amptklib.log.info('{0:,}'.format(total) + ' classified to %s' %
                          taxonomyLookup.get(args.utax_level))

    #clean up padded N's
    amptklib.log.info("Cleaning up padding from OTUs")
    otu_clean = os.path.join(tmp, base + '.clean.otus.fa')
    amptklib.fasta_strip_padding(ref_clustered, otu_clean)
    total = amptklib.countfasta(otu_clean)
    amptklib.log.info('{0:,}'.format(total) + ' total OTUs')

    #now map reads back to OTUs
    uc_out = os.path.join(tmp, base + '.EE' + args.maxee + '.mapping.uc')
    otu_table = os.path.join(tmp, base + '.EE' + args.maxee + '.otu_table.txt')
    #setup reads to map
    if args.map_filtered:
        reads = filter_fasta
    else:
        reads = orig_fasta
    amptklib.log.info("Mapping Reads to OTUs and Building OTU table")
    cmd = [
        'vsearch', '--usearch_global', reads, '--strand', 'plus', '--id',
        '0.97', '--db', otu_clean, '--uc', uc_out, '--otutabout', otu_table,
        '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)

    #count reads mapped
    total = amptklib.line_count2(uc_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads mapped to OTUs ' +
                      '({0:.0f}%)'.format(total / float(orig_total) * 100))

    #Move files around, delete tmp if argument passed.
    currentdir = os.getcwd()
    final_otu = os.path.join(currentdir, base + '.cluster.otus.fa')
    shutil.copyfile(otu_clean, final_otu)
    final_otu_table = os.path.join(currentdir, base + '.otu_table.txt')
    shutil.copyfile(otu_table, final_otu_table)

    if not args.debug:
        shutil.rmtree(tmp)

    #Print location of files to STDOUT
    print("-------------------------------------------------------")
    print("OTU Clustering Script has Finished Successfully")
    print("-------------------------------------------------------")
    if args.debug:
        print("Tmp Folder of files: %s" % tmp)
    print("Clustered OTUs: %s" % os.path.basename(final_otu))
    print("OTU Table: %s" % os.path.basename(final_otu_table))
    print("-------------------------------------------------------")

    otu_print = final_otu.split('/')[-1]
    tab_print = final_otu_table.split('/')[-1]
    if 'darwin' in sys.platform:
        print(colr.WARN + "\nExample of next cmd:" + colr.END +
              " amptk filter -i %s -f %s -b <mock barcode>\n" %
              (tab_print, otu_print))
    else:
        print(
            "\nExample of next cmd: amptk filter -i %s -f %s -b <mock barcode>\n"
            % (tab_print, otu_print))
Example #4
def main(args):
    parser = argparse.ArgumentParser(
        prog='amptk-OTU_cluster.py',
        usage="%(prog)s [options] -i file.demux.fq\n%(prog)s -h for help menu",
        description='''Script runs UPARSE OTU clustering.
		Requires USEARCH by Robert C. Edgar: http://drive5.com/usearch''',
        epilog="""Written by Jon Palmer (2015) [email protected]""",
        formatter_class=MyFormatter)

    parser.add_argument('-i',
                        '--fastq',
                        dest="FASTQ",
                        required=True,
                        help='FASTQ file (Required)')
    parser.add_argument('-o', '--out', help='Base output name')
    parser.add_argument('-e',
                        '--maxee',
                        default='1.0',
                        help='Quality trim EE value')
    parser.add_argument('-p',
                        '--pct_otu',
                        default='97',
                        help="OTU Clustering Percent")
    parser.add_argument('-m',
                        '--minsize',
                        default='2',
                        help='Min size to keep for clustering')
    parser.add_argument('-u',
                        '--usearch',
                        dest="usearch",
                        default='usearch9',
                        help='USEARCH9 EXE')
    parser.add_argument('--uchime_ref',
                        help='Run UCHIME REF [ITS,16S,LSU,COI,custom]')
    parser.add_argument('--map_filtered',
                        action='store_true',
                        help='map quality filtered reads back to OTUs')
    parser.add_argument('--unoise',
                        action='store_true',
                        help='Run De-noising (UNOISE)')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Keep intermediate files')
    parser.add_argument('--cpus',
                        type=int,
                        help="Number of CPUs. Default: auto")
    args = parser.parse_args(args)

    parentdir = os.path.join(os.path.dirname(amptklib.__file__))

    #get basename if not args.out passed
    if args.out:
        base = args.out
    else:
        if 'demux' in args.FASTQ:
            base = os.path.basename(args.FASTQ).split('.demux')[0]
        else:
            base = os.path.basename(args.FASTQ).split('.f')[0]

    #remove logfile if exists
    log_name = base + '.amptk-cluster.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    amptklib.setupLogging(log_name)
    FNULL = open(os.devnull, 'w')
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")

    #initialize script, log system info and usearch version
    amptklib.SystemInfo()
    #Do a version check
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)

    #get number of cpus
    if args.cpus:
        cpus = args.cpus
    else:
        cpus = amptklib.getCPUS()

    #make tmp folder
    tmp = base + '_tmp'
    if not os.path.exists(tmp):
        os.makedirs(tmp)

    #Count FASTQ records
    amptklib.log.info("Loading FASTQ Records")
    #convert to FASTA for mapping
    orig_fasta = os.path.join(tmp, base + '.orig.fa')
    cmd = [
        'vsearch', '--fastq_filter', args.FASTQ, '--fastaout', orig_fasta,
        '--fastq_qmax', '55', '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    orig_total = amptklib.countfasta(orig_fasta)
    size = amptklib.checkfastqsize(args.FASTQ)
    readablesize = amptklib.convertSize(size)
    amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize +
                      ')')

    #Expected Errors filtering step
    filter_out = os.path.join(tmp, base + '.EE' + args.maxee + '.filter.fq')
    filter_fasta = os.path.join(tmp, base + '.EE' + args.maxee + '.filter.fa')
    amptklib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
    cmd = [
        'vsearch', '--fastq_filter', args.FASTQ, '--fastq_maxee',
        str(args.maxee), '--fastqout', filter_out, '--fastaout', filter_fasta,
        '--fastq_qmax', '55', '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfastq(filter_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')

    #now run full length dereplication
    derep_out = os.path.join(tmp, base + '.EE' + args.maxee + '.derep.fa')
    amptklib.log.info("De-replication (remove duplicate reads)")
    cmd = [
        'vsearch', '--derep_fulllength', filter_fasta, '--sizeout', '--output',
        derep_out, '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(derep_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')

    #optional run UNOISE
    if args.unoise:
        unoise_out = os.path.join(tmp,
                                  base + '.EE' + args.maxee + '.denoised.fa')
        amptklib.log.info("Denoising Data with UNOISE")
        cmd = [
            usearch, '-cluster_fast', derep_out, '-centroids', unoise_out,
            '-id', '0.9', '--maxdiffs', '5', '-abskew', '10', '-sizein',
            '-sizeout', '-sort', 'size', '-threads',
            str(cpus)
        ]
        amptklib.runSubprocess(cmd, amptklib.log)
        total = amptklib.countfasta(unoise_out)
        amptklib.log.info('{0:,}'.format(total) + ' reads passed')
    else:
        unoise_out = derep_out

    #now sort by size remove singletons
    sort_out = os.path.join(tmp, base + '.EE' + args.maxee + '.sort.fa')
    cmd = [
        'vsearch', '--sortbysize', unoise_out, '--minsize', args.minsize,
        '--output', sort_out, '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)

    #now run clustering algorithm
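    #UPARSE radius is percent divergence, i.e. 100 minus the clustering identity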
    radius = str(100 - int(args.pct_otu))
    otu_out = os.path.join(tmp, base + '.EE' + args.maxee + '.otus.fa')
    amptklib.log.info("Clustering OTUs (UPARSE)")
    cmd = [
        usearch, '-cluster_otus', sort_out, '-relabel', 'OTU',
        '-otu_radius_pct', radius, '-otus', otu_out, '-threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    numOTUs = amptklib.countfasta(otu_out)
    amptklib.log.info('{0:,}'.format(numOTUs) + ' OTUs')

    #clean up padded N's
    amptklib.log.info("Cleaning up padding from OTUs")
    otu_clean = os.path.join(tmp, base + '.EE' + args.maxee + '.clean.otus.fa')
    amptklib.fasta_strip_padding(otu_out, otu_clean)

    #optional UCHIME Ref
    if not args.uchime_ref:
        uchime_out = otu_clean
    else:
        uchime_out = os.path.join(
            tmp, base + '.EE' + args.maxee + '.uchime.otus.fa')
        #check if file is present, remove from previous run if it is.
        if os.path.isfile(uchime_out):
            os.remove(uchime_out)
        #R. Edgar now says using largest DB is better for UCHIME, so use the one distributed with taxonomy
        if args.uchime_ref in [
                'ITS', '16S', 'LSU', 'COI'
        ]:  #test if it is one that is setup, otherwise default to full path
            uchime_db = os.path.join(parentdir, 'DB', args.uchime_ref + '.udb')
            if not os.path.isfile(uchime_db):
                amptklib.log.error(
                    "Database not properly configured, run `amptk install` to setup DB, skipping chimera filtering"
                )
                uchime_out = otu_clean
            #since uchime cannot work with a udb database, extract FASTA sequences if not already present
            if not amptklib.checkfile(
                    os.path.join(parentdir, 'DB',
                                 args.uchime_ref + '.extracted.fa')):
                uchime_db = os.path.join(parentdir, 'DB',
                                         args.uchime_ref + '.extracted.fa')
                cmd = [
                    'vsearch', '--udb2fasta',
                    os.path.join(parentdir, 'DB', args.uchime_ref + '.udb'),
                    '--output', uchime_db
                ]
                amptklib.runSubprocess(cmd, amptklib.log)
            else:
                uchime_db = os.path.join(parentdir, 'DB',
                                         args.uchime_ref + '.extracted.fa')
        else:
            if os.path.isfile(args.uchime_ref):
                uchime_db = os.path.abspath(args.uchime_ref)
            else:
                amptklib.log.error(
                    "%s is not a valid file, skipping reference chimera filtering"
                    % args.uchime_ref)
                uchime_out = otu_clean
        #now run chimera filtering if all checks out
        if not os.path.isfile(uchime_out):
            amptklib.log.info("Chimera Filtering (VSEARCH) using %s DB" %
                              args.uchime_ref)
            cmd = [
                'vsearch', '--mindiv', '1.0', '--uchime_ref', otu_clean,
                '--db', uchime_db, '--nonchimeras', uchime_out, '--threads',
                str(cpus)
            ]
            amptklib.runSubprocess(cmd, amptklib.log)
            total = amptklib.countfasta(uchime_out)
            uchime_chimeras = numOTUs - total
            amptklib.log.info('{0:,}'.format(total) + ' OTUs passed, ' +
                              '{0:,}'.format(uchime_chimeras) +
                              ' ref chimeras')

    #Filter out OTUs in wrong orientation
    amptklib.log.info('Validating OTU orientation')
    passingOTUs = os.path.join(tmp, base + '.passed.otus.fa')
    numKept, numDropped = amptklib.validateorientation(tmp, sort_out,
                                                       uchime_out, passingOTUs)
    amptklib.log.info('{:,} OTUs validated ({:,} dropped)'.format(
        numKept, numDropped))

    #now map reads back to OTUs and build OTU table
    uc_out = os.path.join(tmp, base + '.EE' + args.maxee + '.mapping.uc')
    otu_table = os.path.join(tmp, base + '.EE' + args.maxee + '.otu_table.txt')
    #setup reads to map
    if args.map_filtered:
        reads = filter_fasta
    else:
        reads = orig_fasta
    amptklib.log.info("Mapping Reads to OTUs and Building OTU table")
    cmd = [
        'vsearch', '--usearch_global', reads, '--strand', 'plus', '--id',
        '0.97', '--db', passingOTUs, '--uc', uc_out, '--otutabout', otu_table,
        '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)

    #count reads mapped
    total = amptklib.line_count2(uc_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads mapped to OTUs ' +
                      '({0:.0f}%)'.format(total / float(orig_total) * 100))

    #Move files around, delete tmp if argument passed.
    currentdir = os.getcwd()
    final_otu = os.path.join(currentdir, base + '.cluster.otus.fa')
    shutil.copyfile(passingOTUs, final_otu)
    final_otu_table = os.path.join(currentdir, base + '.otu_table.txt')
    shutil.copyfile(otu_table, final_otu_table)
    if not args.debug:
        shutil.rmtree(tmp)

    #Print location of files to STDOUT
    print("-------------------------------------------------------")
    print("OTU Clustering Script has Finished Successfully")
    print("-------------------------------------------------------")
    if args.debug:
        print("Tmp Folder of files: %s" % tmp)
    print("Clustered OTUs: %s" % os.path.basename(final_otu))
    print("OTU Table: %s" % os.path.basename(final_otu_table))
    print("-------------------------------------------------------")

    otu_print = final_otu.split('/')[-1]
    tab_print = final_otu_table.split('/')[-1]
    if 'darwin' in sys.platform:
        print(colr.WARN + "\nExample of next cmd:" + colr.END +
              " amptk filter -i %s -f %s -b <mock barcode>\n" %
              (tab_print, otu_print))
    else:
        print(
            "\nExample of next cmd: amptk filter -i %s -f %s -b <mock barcode>\n"
            % (tab_print, otu_print))
Example #5
def main(args):
    parser = argparse.ArgumentParser(
        prog='amptk-dada2.py',
        description=
        '''Script takes output from amptk pre-processing and runs DADA2''',
        epilog="""Written by Jon Palmer (2016) [email protected]""",
        formatter_class=MyFormatter)

    parser.add_argument('-i',
                        '--fastq',
                        required=True,
                        help='Input Demuxed containing FASTQ')
    parser.add_argument('-o', '--out', help='Output Basename')
    parser.add_argument(
        '-m',
        '--min_reads',
        default=10,
        type=int,
        help="Minimum number of reads after Q filtering to run DADA2 on")
    parser.add_argument('-l',
                        '--length',
                        type=int,
                        help='Length to truncate reads')
    parser.add_argument('-e',
                        '--maxee',
                        default='1.0',
                        help='MaxEE quality filtering')
    parser.add_argument('-p',
                        '--pct_otu',
                        default='97',
                        help="Biological OTU Clustering Percent")
    parser.add_argument('--platform',
                        default='ion',
                        choices=['ion', 'illumina', '454'],
                        help='Sequencing platform')
    parser.add_argument('--chimera_method',
                        default='consensus',
                        choices=['consensus', 'pooled', 'per-sample'],
                        help='bimera removal method')
    parser.add_argument('--uchime_ref',
                        help='Run UCHIME REF [ITS,16S,LSU,COI,custom]')
    parser.add_argument('--pool',
                        action='store_true',
                        help='Pool all sequences together for DADA2')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Keep all intermediate files')
    parser.add_argument('-u',
                        '--usearch',
                        dest="usearch",
                        default='usearch9',
                        help='USEARCH9 EXE')
    parser.add_argument('--cpus',
                        type=int,
                        help="Number of CPUs. Default: auto")
    args = parser.parse_args(args)

    parentdir = os.path.join(os.path.dirname(amptklib.__file__))
    dada2script = os.path.join(parentdir, 'dada2_pipeline_nofilt.R')

    #get basename if not args.out passed
    if args.out:
        base = args.out
    else:
        if 'demux' in args.fastq:
            base = os.path.basename(args.fastq).split('.demux')[0]
        else:
            base = os.path.basename(args.fastq).split('.f')[0]

    #remove logfile if exists
    log_name = base + '.amptk-dada2.log'
    if os.path.isfile(log_name):
        amptklib.removefile(log_name)

    amptklib.setupLogging(log_name)
    FNULL = open(os.devnull, 'w')
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")
    #initialize script, log system info and usearch version
    amptklib.SystemInfo()
    #Do a version check
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)

    #get number of cores
    if args.cpus:
        CORES = str(args.cpus)
    else:
        CORES = str(amptklib.getCPUS())

    #check dependencies
    programs = ['Rscript']
    amptklib.CheckDependencies(programs)
    Rversions = amptklib.checkRversion()
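    #minimum R and DADA2 versions for the checks below (the literal version strings are masked in this listing)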
    R_pass = '******'
    dada2_pass = '******'

    #check dada2 first, if good move on, otherwise issue warning
    if not amptklib.gvc(Rversions[1], dada2_pass):
        amptklib.log.error("R v%s; DADA2 v%s detected, need atleast v%s" %
                           (Rversions[0], Rversions[1], dada2_pass))
        amptklib.log.error(
            "See: http://benjjneb.github.io/dada2/dada-installation.html")
        sys.exit(1)
    amptklib.log.info("R v%s; DADA2 v%s" % (Rversions[0], Rversions[1]))

    #Count FASTQ records and remove 3' N's as dada2 can't handle them
    amptklib.log.info("Loading FASTQ Records")
    no_ns = base + '.cleaned_input.fq'
    if args.fastq.endswith('.gz'):
        fastqInput = args.fastq.replace('.gz', '')
        amptklib.Funzip(os.path.abspath(args.fastq),
                        os.path.basename(fastqInput), CORES)
    else:
        fastqInput = os.path.abspath(args.fastq)
    amptklib.fastq_strip_padding(os.path.basename(fastqInput), no_ns)
    demuxtmp = base + '.original.fa'
    cmd = [
        'vsearch', '--fastq_filter',
        os.path.abspath(no_ns), '--fastq_qmax', '55', '--fastaout', demuxtmp,
        '--threads', CORES
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    orig_total = amptklib.countfasta(demuxtmp)
    size = amptklib.checkfastqsize(no_ns)
    readablesize = amptklib.convertSize(size)
    amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize +
                      ')')

    #quality filter
    amptklib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
    derep = base + '.qual-filtered.fq'
    filtercmd = [
        'vsearch', '--fastq_filter', no_ns, '--fastq_maxee',
        str(args.maxee), '--fastqout', derep, '--fastq_qmax', '55',
        '--fastq_maxns', '0', '--threads', CORES
    ]
    amptklib.runSubprocess(filtercmd, amptklib.log)
    total = amptklib.countfastq(derep)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')

    #split into individual files
    amptklib.log.info("Splitting FASTQ file by Sample into individual files")
    filtfolder = base + '_filtered'
    if os.path.isdir(filtfolder):
        shutil.rmtree(filtfolder)
    os.makedirs(filtfolder)
    splitDemux2(derep, filtfolder, args=args)

    #check for minimum number of reads in each sample
    remove = []
    files = [i for i in os.listdir(filtfolder) if i.endswith('.fastq')]
    for x in files:
        if amptklib.countfastq(os.path.join(filtfolder, x)) < args.min_reads:
            remove.append(x)
    if len(remove) > 0:
        amptklib.log.info("Dropping %s as fewer than %i reads" %
                          (', '.join(remove), args.min_reads))
        for y in remove:
            os.remove(os.path.join(filtfolder, y))

    #now run DADA2 on filtered folder
    amptklib.log.info("Running DADA2 pipeline")
    dada2log = base + '.dada2.Rscript.log'
    dada2out = base + '.dada2.csv'
    #check pooling vs notpooled, default is not pooled.
    if args.pool:
        POOL = 'TRUE'
    else:
        POOL = 'FALSE'
    with open(dada2log, 'w') as logfile:
        subprocess.call([
            'Rscript', '--vanilla', dada2script, filtfolder, dada2out,
            args.platform, POOL, CORES, args.chimera_method
        ],
                        stdout=logfile,
                        stderr=logfile)

    #check for results
    if not os.path.isfile(dada2out):
        amptklib.log.error("DADA2 run failed, please check %s logfile" %
                           dada2log)
        sys.exit(1)

    #now process the output, pull out fasta, rename, etc
    fastaout = base + '.otus.tmp'
    OTUCounts = {}
    counter = 1
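    #dada2out CSV: first column is the ASV sequence, remaining columns are per-sample read counts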
    with open(fastaout, 'w') as writefasta:
        with open(dada2out, 'r') as input:
            next(input)
            for line in input:
                line = line.replace('\n', '')
                line = line.replace('"', '')
                cols = line.split(',')
                Seq = cols[0]
                countList = [int(x) for x in cols[1:]]
                counts = sum(countList)
                ID = 'ASV' + str(counter)
                if not ID in OTUCounts:
                    OTUCounts[ID] = counts
                writefasta.write(">%s\n%s\n" % (ID, Seq))
                counter += 1

    #get number of bimeras from logfile
    bimeras, totalSeqs = 0, 0
    with open(dada2log, 'r') as bimeracheck:
        for line in bimeracheck:
            if line.startswith('Identified '):
                bimeraline = line.split(' ')
                bimeras = int(bimeraline[1])
                totalSeqs = int(bimeraline[5])
    validSeqs = totalSeqs - bimeras
    amptklib.log.info('{0:,}'.format(totalSeqs) +
                      ' total amplicon sequence variants (ASVs)')
    amptklib.log.info('{0:,}'.format(bimeras) + ' denovo chimeras removed')
    amptklib.log.info('{0:,}'.format(validSeqs) + ' valid ASVs')

    #optional UCHIME Ref
    uchime_out = base + '.nonchimeras.fa'
    chimeraFreeTable = base + '.otu_table.txt'
    iSeqs = base + '.ASVs.fa'
    if not args.uchime_ref:
        os.rename(fastaout, iSeqs)
    else:
        #check if file is present, remove from previous run if it is.
        if os.path.isfile(iSeqs):
            amptklib.removefile(iSeqs)
        #R. Edgar now says using largest DB is better for UCHIME, so use the one distributed with taxonomy
        if args.uchime_ref in [
                'ITS', '16S', 'LSU', 'COI'
        ]:  #test if it is one that is setup, otherwise default to full path
            uchime_db = os.path.join(parentdir, 'DB', args.uchime_ref + '.udb')
            if not os.path.isfile(uchime_db):
                amptklib.log.error(
                    "Database not properly configured, run `amptk install` to setup DB, skipping chimera filtering"
                )
                uchime_out = fastaout
            #since uchime cannot work with a udb database, extract FASTA sequences if not already present
            if not amptklib.checkfile(
                    os.path.join(parentdir, 'DB',
                                 args.uchime_ref + '.extracted.fa')):
                uchime_db = os.path.join(parentdir, 'DB',
                                         args.uchime_ref + '.extracted.fa')
                cmd = [
                    'vsearch', '--udb2fasta',
                    os.path.join(parentdir, 'DB', args.uchime_ref + '.udb'),
                    '--output', uchime_db
                ]
                amptklib.runSubprocess(cmd, amptklib.log)
            else:
                uchime_db = os.path.join(parentdir, 'DB',
                                         args.uchime_ref + '.extracted.fa')
        else:
            if os.path.isfile(args.uchime_ref):
                uchime_db = os.path.abspath(args.uchime_ref)
            else:
                amptklib.log.error(
                    "%s is not a valid file, skipping reference chimera filtering"
                    % args.uchime_ref)
                iSeqs = fastaout
        #now run chimera filtering if all checks out
        if not os.path.isfile(iSeqs):
            amptklib.log.info("Chimera Filtering (VSEARCH) using %s DB" %
                              args.uchime_ref)
            cmd = [
                'vsearch', '--mindiv', '1.0', '--uchime_ref', fastaout, '--db',
                uchime_db, '--nonchimeras', iSeqs, '--threads', CORES
            ]
            amptklib.runSubprocess(cmd, amptklib.log)
            total = amptklib.countfasta(iSeqs)
            uchime_chimeras = validSeqs - total
            amptklib.log.info('{0:,}'.format(total) + ' ASVs passed, ' +
                              '{0:,}'.format(uchime_chimeras) +
                              ' ref chimeras removed')
            if os.path.isfile(fastaout):
                amptklib.removefile(fastaout)

    #setup output files
    dadademux = base + '.dada2.map.uc'
    bioSeqs = base + '.cluster.otus.fa'
    bioTable = base + '.cluster.otu_table.txt'
    uctmp = base + '.map.uc'
    ClusterComp = base + '.ASVs2clusters.txt'

    #Filter out ASVs in wrong orientation
    amptklib.log.info('Validating ASV orientation')
    os.rename(iSeqs, iSeqs + '.bak')
    numKept, numDropped = amptklib.validateorientationDADA2(
        OTUCounts, iSeqs + '.bak', iSeqs)
    amptklib.log.info('{:,} ASVs validated ({:,} dropped)'.format(
        numKept, numDropped))
    amptklib.SafeRemove(iSeqs + '.bak')

    #map reads to DADA2 OTUs
    amptklib.log.info("Mapping reads to DADA2 ASVs")
    cmd = [
        'vsearch', '--usearch_global', demuxtmp, '--db', iSeqs, '--id', '0.97',
        '--uc', dadademux, '--strand', 'plus', '--otutabout', chimeraFreeTable,
        '--threads', CORES
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.line_count2(dadademux)
    amptklib.log.info('{0:,}'.format(total) + ' reads mapped to ASVs ' +
                      '({0:.0f}%)'.format(total / float(orig_total) * 100))

    #cluster
    amptklib.log.info("Clustering ASVs at %s%% to generate biological OTUs" %
                      args.pct_otu)
    radius = float(args.pct_otu) / 100.
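    #--usersort lets vsearch cluster sequences in their given order rather than requiring size-sorted input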
    cmd = [
        'vsearch', '--cluster_smallmem', iSeqs, '--centroids', bioSeqs, '--id',
        str(radius), '--strand', 'plus', '--relabel', 'OTU', '--qmask', 'none',
        '--usersort', '--threads', CORES
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(bioSeqs)
    amptklib.log.info('{0:,}'.format(total) + ' OTUs generated')

    #determine where iSeqs clustered
    iSeqmap = base + '.ASV_map.uc'
    cmd = [
        'vsearch', '--usearch_global', iSeqs, '--db', bioSeqs, '--id',
        str(radius), '--uc', iSeqmap, '--strand', 'plus', '--threads', CORES
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    iSeqMapped = {}
    with open(iSeqmap, 'r') as mapping:
        for line in mapping:
            line = line.replace('\n', '')
            cols = line.split('\t')
            OTU = cols[9]
            Hit = cols[8]
            if not OTU in iSeqMapped:
                iSeqMapped[OTU] = [Hit]
            else:
                iSeqMapped[OTU].append(Hit)
    with open(ClusterComp, 'w') as clusters:
        clusters.write('OTU\tASVs\n')
        for k, v in natsorted(list(iSeqMapped.items())):
            clusters.write('%s\t%s\n' % (k, ', '.join(v)))
    #create OTU table
    amptklib.log.info("Mapping reads to OTUs")
    cmd = [
        'vsearch', '--usearch_global', demuxtmp, '--db', bioSeqs, '--id',
        '0.97', '--uc', uctmp, '--strand', 'plus', '--otutabout', bioTable,
        '--threads', CORES
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.line_count2(uctmp)
    amptklib.log.info('{0:,}'.format(total) + ' reads mapped to OTUs ' +
                      '({0:.0f}%)'.format(total / float(orig_total) * 100))

    if not args.debug:
        amptklib.removefile(no_ns)
        shutil.rmtree(filtfolder)
        amptklib.removefile(dada2out)
        amptklib.removefile(derep)
        amptklib.removefile(demuxtmp)
        amptklib.removefile(uctmp)
        amptklib.removefile(iSeqmap)
        amptklib.removefile(dadademux)

    #Print location of files to STDOUT
    print("-------------------------------------------------------")
    print("DADA2 Script has Finished Successfully")
    print("-------------------------------------------------------")
    if args.debug:
        print("Tmp Folder of files: %s" % filtfolder)
    print("Amplicon sequence variants: %s" % iSeqs)
    print("ASV OTU Table: %s" % chimeraFreeTable)
    print("Clustered OTUs: %s" % bioSeqs)
    print("OTU Table: %s" % bioTable)
    print("ASVs 2 OTUs: %s" % ClusterComp)
    print("-------------------------------------------------------")

    otu_print = bioSeqs.split('/')[-1]
    tab_print = bioTable.split('/')[-1]
    if 'darwin' in sys.platform:
        print(colr.WARN + "\nExample of next cmd:" + colr.END +
              " amptk filter -i %s -f %s -b <mock barcode>\n" %
              (tab_print, otu_print))
    else:
        print(
            "\nExample of next cmd: amptk filter -i %s -f %s -b <mock barcode>\n"
            % (tab_print, otu_print))
Example #6
def main(args):
    parser = argparse.ArgumentParser(
        prog='amptk-unoise2.py',
        usage="%(prog)s [options] -i file.demux.fq\n%(prog)s -h for help menu",
        description='''Script runs UNOISE2 algorithm.
		Requires USEARCH9 by Robert C. Edgar: http://drive5.com/usearch''',
        epilog="""Written by Jon Palmer (2016) [email protected]""",
        formatter_class=MyFormatter)

    parser.add_argument('-i',
                        '--fastq',
                        dest="FASTQ",
                        required=True,
                        help='FASTQ file (Required)')
    parser.add_argument('-o', '--out', help='Base output name')
    parser.add_argument('-e',
                        '--maxee',
                        default='1.0',
                        help='Quality trim EE value')
    parser.add_argument('-m',
                        '--minsize',
                        default='8',
                        help='Min size to keep for denoising')
    parser.add_argument('-u',
                        '--usearch',
                        dest="usearch",
                        default='usearch9',
                        help='USEARCH9 EXE')
    parser.add_argument('-p',
                        '--pct_otu',
                        default='97',
                        help="Biological OTU Clustering Percent")
    parser.add_argument('--uchime_ref',
                        help='Run UCHIME2 REF [ITS,16S,LSU,COI,custom]')
    parser.add_argument('--map_filtered',
                        action='store_true',
                        help='map quality filtered reads back to OTUs')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Keep intermediate files')
    parser.add_argument('--cpus',
                        type=int,
                        help="Number of CPUs. Default: auto")
    args = parser.parse_args(args)

    parentdir = os.path.join(os.path.dirname(amptklib.__file__))

    #get basename if not args.out passed
    if args.out:
        base = args.out
    else:
        if 'demux' in args.FASTQ:
            base = os.path.basename(args.FASTQ).split('.demux')[0]
        else:
            base = os.path.basename(args.FASTQ).split('.f')[0]

    #remove logfile if exists
    log_name = base + '.amptk-unoise2.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    amptklib.setupLogging(log_name)
    FNULL = open(os.devnull, 'w')
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")

    #initialize script, log system info and usearch version
    amptklib.SystemInfo()
    #Do a version check
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)

    #get number of cpus
    if args.cpus:
        cpus = args.cpus
    else:
        cpus = amptklib.getCPUS()

    #make tmp folder
    tmp = base + '_tmp'
    if not os.path.exists(tmp):
        os.makedirs(tmp)

    #Count FASTQ records
    amptklib.log.info("Loading FASTQ Records")
    #convert to FASTA for mapping
    orig_fasta = os.path.join(tmp, base + '.orig.fa')
    cmd = [
        'vsearch', '--fastq_filter', args.FASTQ, '--fastaout', orig_fasta,
        '--fastq_qmax', '55', '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    orig_total = amptklib.countfasta(orig_fasta)
    size = amptklib.checkfastqsize(args.FASTQ)
    readablesize = amptklib.convertSize(size)
    amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize +
                      ')')

    #Expected Errors filtering step
    filter_out = os.path.join(tmp, base + '.EE' + args.maxee + '.filter.fq')
    filter_fasta = os.path.join(tmp, base + '.EE' + args.maxee + '.filter.fa')
    amptklib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
    cmd = [
        'vsearch', '--fastq_filter', args.FASTQ, '--fastq_maxee',
        str(args.maxee), '--fastqout', filter_out, '--fastaout', filter_fasta,
        '--fastq_qmax', '55', '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfastq(filter_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')

    #now run full length dereplication
    derep_out = os.path.join(tmp, base + '.EE' + args.maxee + '.derep.fa')
    amptklib.log.info("De-replication (remove duplicate reads)")
    cmd = [
        'vsearch', '--derep_fulllength', filter_out, '--relabel', 'Read_',
        '--sizeout', '--output', derep_out, '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(derep_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')

    #now run de-noiser UNOISE2
    amptklib.log.info("Denoising reads with UNOISE2")
    unoise_out = os.path.join(tmp, base + '.EE' + args.maxee + '.unoise.fa')
    cmd = [
        usearch, '-unoise2', derep_out, '-fastaout', unoise_out, '-minampsize',
        args.minsize, '-threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(unoise_out)
    amptklib.log.info('{0:,}'.format(total) + ' denoised sequences')

    #strip N's
    amptklib.log.info("Cleaning up padding from OTUs")
    otu_clean = os.path.join(tmp, base + '.EE' + args.maxee + '.clean.fa')
    amptklib.fasta_strip_padding(unoise_out, otu_clean)

    #run optional uchime_ref
    if not args.uchime_ref:
        uchime_out = otu_clean
    else:
        uchime_out = os.path.join(
            tmp, base + '.EE' + args.maxee + '.uchime.otus.fa')
        #R. Edgar now says using largest DB is better for UCHIME, so use the one distributed with taxonomy
        if args.uchime_ref in [
                'ITS', '16S', 'LSU', 'COI'
        ]:  #test if it is one that is setup, otherwise default to full path
            uchime_db = os.path.join(parentdir, 'DB', args.uchime_ref + '.udb')
            if not os.path.isfile(uchime_db):
                amptklib.log.error(
                    "Database not properly configured, run `amptk install` to setup DB, skipping chimera filtering"
                )
                uchime_out = otu_clean
            #uchime cannot work with a udb database, so extract the FASTA sequences if that has not been done yet
            elif not amptklib.checkfile(
                    os.path.join(parentdir, 'DB',
                                 args.uchime_ref + '.extracted.fa')):
                uchime_db = os.path.join(parentdir, 'DB',
                                         args.uchime_ref + '.extracted.fa')
                cmd = [
                    'vsearch', '--udb2fasta',
                    os.path.join(parentdir, 'DB', args.uchime_ref + '.udb'),
                    '--output', uchime_db
                ]
                amptklib.runSubprocess(cmd, amptklib.log)
            else:
                uchime_db = os.path.join(parentdir, 'DB',
                                         args.uchime_ref + '.extracted.fa')
        else:
            uchime_db = os.path.abspath(args.uchime_ref)
        #now run chimera filtering if all checks out
        if not os.path.isfile(uchime_out):
            amptklib.log.info("Chimera Filtering (VSEARCH)")
            cmd = [
                'vsearch', '--mindiv', '1.0', '--uchime_ref', otu_clean,
                '--db', uchime_db, '--nonchimeras', uchime_out, '--threads',
                str(cpus)
            ]
            amptklib.runSubprocess(cmd, amptklib.log)
            total = amptklib.countfasta(uchime_out)
            amptklib.log.info('{0:,}'.format(total) + ' OTUs passed')

    #inferred sequences
    iSeqs = base + '.ASVs.fa'
    amptklib.fastarename(uchime_out, 'ASV', iSeqs)

    #Filter out ASVs in wrong orientation
    amptklib.log.info('Validating ASV orientation')
    passingOTUs = os.path.join(tmp, base + '.passed.asvs.fa')
    numKept, numDropped = amptklib.validateorientation(tmp, derep_out,
                                                       uchime_out, passingOTUs)
    amptklib.log.info('{:,} ASVs validated ({:,} dropped)'.format(
        numKept, numDropped))

    #build OTU table with iSeqs
    uc_iSeq_out = os.path.join(tmp, base + '.EE' + args.maxee + '.mapping.uc')
    iSeq_otu_table = base + '.otu_table.txt'
    #setup reads to map
    if args.map_filtered:
        reads = filter_fasta
    else:
        reads = orig_fasta
    amptklib.log.info("Mapping Reads to ASVs and Building OTU table")
    cmd = [
        'vsearch', '--usearch_global', reads, '--strand', 'plus', '--id',
        '0.97', '--db', passingOTUs, '--uc', uc_iSeq_out, '--otutabout',
        iSeq_otu_table, '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)

    #count reads mapped
    total = amptklib.line_count2(uc_iSeq_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads mapped to ASVs ' +
                      '({0:.0f}%)'.format(total / float(orig_total) * 100))

    #now cluster to biological OTUs with UCLUST
    radius = float(args.pct_otu) / 100.
    amptklib.log.info(
        "Clustering denoised sequences into biological OTUs at %s%%" %
        args.pct_otu)
    uclust_out = os.path.join(tmp, base + '.EE' + args.maxee + '.uclust.fa')
    cmd = [
        'vsearch', '--cluster_smallmem', passingOTUs, '--centroids',
        uclust_out, '--id',
        str(radius), '--strand', 'plus', '--relabel', 'OTU', '--qmask', 'none',
        '--usersort', '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(uclust_out)
    amptklib.log.info('{0:,}'.format(total) + ' OTUs generated')

    #determine where denoised sequences clustered
    ClusterComp = base + '.ASVs2clusters.txt'
    iSeqmap = base + '.unoise_map.uc'
    cmd = [
        usearch, '-usearch_global', passingOTUs, '-db', uclust_out, '-id',
        str(radius), '-uc', iSeqmap, '-strand', 'plus', '-threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    iSeqMapped = {}
    with open(iSeqmap, 'r') as mapping:
        for line in mapping:
            line = line.replace('\n', '')
            cols = line.split('\t')
            OTU = cols[9]
            Hit = cols[8]
            if OTU not in iSeqMapped:
                iSeqMapped[OTU] = [Hit]
            else:
                iSeqMapped[OTU].append(Hit)
    with open(ClusterComp, 'w') as clusters:
        clusters.write('OTU\tASVs\n')
        for k, v in natsorted(list(iSeqMapped.items())):
            clusters.write('%s\t%s\n' % (k, ', '.join(v)))
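    # NOTE: in the USEARCH/VSEARCH tab-delimited .uc mapping format, column 9
    # holds the query label (here a denoised ASV) and column 10 the target
    # centroid label (here the clustered OTU), hence cols[8] and cols[9] above.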

    #now map reads back to OTUs and build OTU table
    uc_out = os.path.join(tmp,
                          base + '.EE' + args.maxee + '.cluster.mapping.uc')
    otu_table = os.path.join(
        tmp, base + '.EE' + args.maxee + '.cluster.otu_table.txt')
    #setup reads to map
    if args.map_filtered:
        reads = filter_fasta
    else:
        reads = orig_fasta
    amptklib.log.info("Mapping Reads to OTUs and Building OTU table")
    cmd = [
        'vsearch', '--usearch_global', reads, '--strand', 'plus', '--id',
        '0.97', '--db', uclust_out, '--uc', uc_out, '--otutabout', otu_table,
        '--threads',
        str(cpus)
    ]
    amptklib.runSubprocess(cmd, amptklib.log)

    #count reads mapped
    total = amptklib.line_count2(uc_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads mapped to OTUs ' +
                      '({0:.0f}%)'.format(total / float(orig_total) * 100))

    #Move files around, delete tmp if argument passed.
    currentdir = os.getcwd()
    final_otu = os.path.join(currentdir, base + '.cluster.otus.fa')
    shutil.copyfile(uclust_out, final_otu)
    final_otu_table = os.path.join(currentdir, base + '.cluster.otu_table.txt')
    shutil.copyfile(otu_table, final_otu_table)
    if not args.debug:
        shutil.rmtree(tmp)

    #Print location of files to STDOUT
    print("-------------------------------------------------------")
    print("UNOISE2 Script has Finished Successfully")
    print("-------------------------------------------------------")
    if args.debug:
        print("Tmp Folder of files: %s" % tmp)
    print("Amplicon sequence variants: %s" % passingOTUs)
    print("ASV OTU Table: %s" % iSeq_otu_table)
    print("Clustered OTUs: %s" % os.path.basename(final_otu))
    print("OTU Table: %s" % os.path.basename(final_otu_table))
    print("ASVs 2 OTUs: %s" % ClusterComp)
    print("-------------------------------------------------------")

    otu_print = final_otu.split('/')[-1]
    tab_print = final_otu_table.split('/')[-1]
    if 'darwin' in sys.platform:
        print(colr.WARN + "\nExample of next cmd:" + colr.END +
              " amptk filter -i %s -f %s -b <mock barcode>\n" %
              (tab_print, otu_print))
    else:
        print(
            "\nExample of next cmd: amptk filter -i %s -f %s -b <mock barcode>\n"
            % (tab_print, otu_print))
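
Because each of these scripts wires `main(args)` straight into `parser.parse_args(args)`, they can be exercised programmatically as well as through the amptk CLI. A hypothetical call to the UNOISE2 entry point above, where the file name and option values are placeholders:

# Hypothetical invocation; 'run1.demux.fq' and the option values are
# placeholders. Equivalent shell form: amptk unoise2 -i run1.demux.fq -o run1
main(['-i', 'run1.demux.fq', '-o', 'run1', '--cpus', '4'])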
Example #7
def main(args):
    parser = argparse.ArgumentParser(
        prog='amptk-drop.py',
        description='''Script that drops OTUs and then creates OTU table''',
        epilog="""Written by Jon Palmer (2016) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i',
                        '--input',
                        required=True,
                        help='OTUs in FASTA format')
    parser.add_argument('-r',
                        '--reads',
                        required=True,
                        help='Demuxed reads FASTQ format')
    parser.add_argument('-o', '--out', help='Base output name')
    parser.add_argument('-l',
                        '--list',
                        nargs='+',
                        help='Input list of (BC) names to remove')
    parser.add_argument('-f',
                        '--file',
                        help='File containing list of names to remove')
    args = parser.parse_args(args)

    #get basename if not args.out passed
    if args.out:
        base = args.out
    else:
        if 'otus' in args.input:
            base = os.path.basename(args.input).split('.otus')[0]
        else:
            base = os.path.basename(args.input).split('.f')[0]

    #remove logfile if exists
    log_name = base + '.amptk-drop.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    amptklib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")

    #initialize script, log system info and usearch version
    amptklib.SystemInfo()

    #check the list and file parameters; exactly one of them must be provided
    if not args.list and not args.file:
        amptklib.log.error(
            "Error, you must specify a list of OTU names or a file containing names"
        )
        sys.exit(1)
    if args.list and args.file:
        amptklib.log.error(
            "Error, you must specify either a list of OTU names or a file containing OTU names, not both"
        )
        sys.exit(1)
    if args.file:
        count = amptklib.line_count(args.file)
        #load in list of names to remove
        with open(args.file, 'r') as infile:
            lines = [line.rstrip('\n') for line in infile]
    if args.list:
        count = len(args.list)
        lines = args.list
    #make sure it is a set, faster lookup
    dropList = set(lines)

    #load data
    total = amptklib.countfasta(args.input)
    amptklib.log.info("Loading %i OTUs" % total)

    #load in the fasta file, change if in dictionary and output to stdout
    amptklib.log.info("Dropping %i OTUs" % count)
    newOTUs = base + '.cleaned.otus.fa'
    with open(newOTUs, 'w') as otus:
        with open(args.input, 'r') as fasta:
            for rec in SeqIO.parse(fasta, 'fasta'):
                if rec.id not in dropList:
                    SeqIO.write(rec, otus, 'fasta')

    #now make new OTU table
    amptklib.log.info("Mapping Reads to OTUs and Building OTU table")
    newTable = base + '.cleaned.otu_table.txt'
    tmpReads = base + '.reads.tmp'
    uc_out = base + '.mapping.uc'
    cmd = [
        'vsearch', '--fastq_filter', args.reads, '--fastaout', tmpReads,
        '--fastq_qmax', '55'
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    cmd = [
        'vsearch', '--usearch_global', tmpReads, '--strand', 'plus', '--id',
        '0.97', '--db', newOTUs, '--uc', uc_out, '--otutabout', newTable
    ]
    amptklib.runSubprocess(cmd, amptklib.log)

    #count OTUs
    otu_count = amptklib.countfasta(newOTUs)
    amptklib.log.info('{0:,}'.format(otu_count) + ' OTUs remaining')

    #count reads mapped
    total = amptklib.line_count(uc_out)
    orig_total = amptklib.countfasta(tmpReads)
    amptklib.log.info('{0:,}'.format(total) + ' reads mapped to OTUs ' +
                      '({0:.0f}%)'.format(total / float(orig_total) * 100))

    #Print location of files to STDOUT
    print("-------------------------------------------------------")
    print("Clustered OTUs: %s" % newOTUs)
    print("OTU Table: %s" % newTable)
    print("-------------------------------------------------------")

    #cleanup
    amptklib.removefile(tmpReads)
    amptklib.removefile(uc_out)

    otu_print = newOTUs.split('/')[-1]
    tab_print = newTable.split('/')[-1]
    if 'darwin' in sys.platform:
        print(colr.WARN + "\nExample of next cmd:" + colr.END +
              " amptk filter -i %s -f %s -b <mock barcode>\n" %
              (tab_print, otu_print))
    else:
        print(
            "\nExample of next cmd: amptk filter -i %s -f %s -b <mock barcode>\n"
            % (tab_print, otu_print))
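
The drop script accepts OTU names either inline (-l) or from a file (-f), but not both. A hypothetical inline invocation, with placeholder file and OTU names:

# Hypothetical invocation with placeholder names; drops OTU1 and OTU13,
# then remaps reads to the remaining OTUs to rebuild the table.
main(['-i', 'run1.otus.fa', '-r', 'run1.demux.fq', '-l', 'OTU1', 'OTU13'])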
Example #8
def main(args):
    parser = argparse.ArgumentParser(
        prog='amptk-lulu.py',
        description=
        '''Script runs OTU table post processing LULU to identify low abundance error OTUs''',
        epilog="""Written by Jon Palmer (2018) [email protected]""",
        formatter_class=MyFormatter)

    parser.add_argument('-i',
                        '--otu_table',
                        required=True,
                        help='Input OTU table')
    parser.add_argument('-f',
                        '--fasta',
                        required=True,
                        help='Input OTUs (multi-fasta)')
    parser.add_argument('-o', '--out', help='Output folder basename')
    parser.add_argument('--min_ratio_type',
                        default='min',
                        choices=['min', 'avg'],
                        help="LULU minimum ratio threshold")
    parser.add_argument('--min_ratio',
                        default=1,
                        type=int,
                        help="LULU minimum ratio")
    parser.add_argument('--min_match',
                        default=84,
                        type=int,
                        help="LULU minimum match percent identity")
    parser.add_argument('--min_relative_cooccurence',
                        default=95,
                        type=int,
                        help="LULU minimum relative cooccurance")
    parser.add_argument('--debug',
                        action='store_true',
                        help='Keep Intermediate Files')
    args = parser.parse_args(args)

    #get location of R script
    parentdir = os.path.dirname(amptklib.__file__)
    luluScript = os.path.join(parentdir, 'runLULU.R')

    if not args.out:
        #get base name of files
        if 'otu_table' in args.otu_table:
            base = os.path.basename(args.otu_table).split(".otu_table")[0]
        elif 'final.txt' in args.otu_table:
            base = os.path.basename(args.otu_table).split(".final")[0]
        else:
            base = os.path.basename(args.otu_table).split(".txt")[0]
    else:
        base = args.out

    #remove logfile if exists
    log_name = base + '.amptk-lulu.log'
    if os.path.isfile(log_name):
        amptklib.removefile(log_name)

    amptklib.setupLogging(log_name)
    FNULL = open(os.devnull, 'w')
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")
    #initialize script, log system info and usearch version
    amptklib.SystemInfo()
    amptklib.versionDependencyChecks('usearch9')
    #check dependencies
    programs = ['Rscript', 'vsearch']
    amptklib.CheckDependencies(programs)
    Rversions = amptklib.checkRversion()
    if Rversions[3] == '0.0.0':
        amptklib.log.info("R v%s installed, LULU not installed")
        sys.exit(1)
    else:
        amptklib.log.info("R v%s; LULU v%s" % (Rversions[0], Rversions[3]))

    #this is a simple wrapper for an R script so easier to run from amptk menu
    tmpdir = 'lulu_' + str(os.getpid())
    if not os.path.isdir(tmpdir):
        os.makedirs(tmpdir)

    #generate the match list using the minimum match pident
    match_file = os.path.join(tmpdir, 'match_file.txt')
    amptklib.log.info("Loading {:,} OTUs".format(
        amptklib.countfasta(args.fasta)))
    amptklib.log.info(
        "Generating pairwise percent identity between OTUs using VSEARCH at {:}% identity"
        .format(args.min_match))
    cmd = [
        'vsearch', '--usearch_global',
        os.path.abspath(args.fasta), '--db',
        os.path.abspath(args.fasta), '--self', '--id',
        str(args.min_match / 100), '--iddef', '1', '--userout', match_file,
        '--userfields', 'query+target+id', '--maxaccepts', '0', '--query_cov',
        '.9', '--maxhits', '10'
    ]
    amptklib.runSubprocess(cmd, amptklib.log)

    #now run LULU in R
    LULU_log = os.path.join(tmpdir, 'LULU-R.log')
    lulu_otu_table = base + '.lulu.otu_table.txt'
    dropList = os.path.join(tmpdir, 'droplist.txt')
    MapData = base + '.lulu.otu-map.txt'
    amptklib.log.info("Running LULU algorithm")
    cmd = [
        'Rscript', '--vanilla', luluScript,
        os.path.abspath(args.otu_table),
        os.path.abspath(match_file), args.min_ratio_type,
        str(args.min_ratio),
        str(args.min_match),
        str(args.min_relative_cooccurence / 100), lulu_otu_table, dropList,
        MapData
    ]
    amptklib.runSubprocess4(cmd, amptklib.log, LULU_log)

    #get updated OTUs
    remove = set()
    with open(dropList, 'r') as dropped:
        for line in dropped:
            remove.add(line.rstrip())
    lulu_otus = base + '.lulu.otus.fa'
    with open(lulu_otus, 'w') as output:
        with open(args.fasta, 'r') as infasta:
            for record in SeqIO.parse(infasta, 'fasta'):
                if record.id not in remove:
                    output.write('>%s\n%s\n' % (record.id, record.seq))
    amptklib.log.info(
        "LULU has merged {:,} OTUs, output data contains {:,} OTUs".format(
            len(remove), amptklib.countfasta(lulu_otus)))
    amptklib.log.info("LULU OTU table post processing finished\n\
----------------------------------\n\
OTU table:  {:}\n\
OTU FASTA:  {:}\n\
LULU map:   {:}\n\
----------------------------------".format(lulu_otu_table, lulu_otus, MapData))
    if 'win32' in sys.platform:
        print(
            "\nExample of next cmd: amptk taxonomy -f %s -i %s -m mapping_file.txt -d ITS2\n"
            % (lulu_otus, lulu_otu_table))
    else:
        print(colr.WARN + "\nExample of next cmd:" + colr.END +
              " amptk taxonomy -f %s -i %s -m mapping_file.txt -d ITS2\n" %
              (lulu_otus, lulu_otu_table))
    if not args.debug:
        if os.path.isdir(tmpdir):
            shutil.rmtree(tmpdir)
    print("-------------------------------------------------------")