Esempio n. 1
0
def makeDB(input):
    """Build a USEARCH-format (.udb) database from a FASTA file.

    A one-line summary of the build (method, source FASTA, primers and record
    count, space separated) is written to ``<args.out>.udb.txt``.  Then
    ``usearch`` is run with ``-makeudb_utax`` or ``-makeudb_usearch``
    depending on ``args.create_db``; any other value only produces the
    summary file.  Tool output (stdout and stderr) is captured in
    ``<args.out>.utax.log`` or ``<args.out>.usearch.log``.

    Relies on the module-level names ``args``, ``usearch`` and ``amptklib``.
    When ``args.trimming`` is truthy, ``args.F_primer`` and ``args.R_primer``
    are overwritten with the string ``'None'`` as a side effect.

    Args:
        input: Path to the FASTA file to index.  The name shadows the
            builtin but is kept so existing callers keep working.

    Returns:
        None.  Success or failure is reported through ``amptklib.log``.
    """
    db_details = args.out + '.udb.txt'
    usearch_db = args.out + '.udb'
    report = args.out + '.report.txt'

    Total = amptklib.countfasta(input)
    if args.trimming:
        args.F_primer = 'None'
        args.R_primer = 'None'
    db_string = ' '.join([
        args.create_db, args.fasta, args.F_primer, args.R_primer,
        str(Total)
    ])
    with open(db_details, 'w') as details:
        details.write(db_string)

    # Pick the command, log file and messages for the requested DB type; the
    # actual run/verify steps below are shared by both types.
    if args.create_db == 'utax':
        build_log = args.out + '.utax.log'
        start_msg = "Creating UTAX Database, this may take awhile"
        fail_msg = "There was a problem creating the DB, check the UTAX log file %s"
        cmd = [
            usearch, '-makeudb_utax', input, '-output', usearch_db,
            '-report', report, '-utax_trainlevels', args.utax_trainlevels,
            '-utax_splitlevels', args.utax_splitlevels, '-notrunclabels'
        ]
    elif args.create_db == 'usearch':
        build_log = args.out + '.usearch.log'
        start_msg = "Creating USEARCH Database"
        fail_msg = "There was a problem creating the DB, check the log file %s"
        cmd = [
            usearch, '-makeudb_usearch', input, '-output', usearch_db,
            '-notrunclabels'
        ]
    else:
        # Nothing to build for other methods; only the summary file is written.
        return

    # Start from a fresh log so old output is never mistaken for this run's.
    if os.path.isfile(build_log):
        os.remove(build_log)
    amptklib.log.info(start_msg)
    # Log the command exactly as executed (including the user-supplied
    # train/split levels) so the debug trace can be used to reproduce the run.
    amptklib.log.debug(' '.join(str(part) for part in cmd))
    with open(build_log, 'w') as log_handle:
        status = subprocess.call(cmd, stdout=log_handle, stderr=log_handle)

    # Require a clean exit as well as the output file: a .udb left over from
    # an earlier run must not be reported as a successful build.
    if status == 0 and os.path.isfile(usearch_db):
        amptklib.log.info("Database %s created successfully" % usearch_db)
    else:
        # Point at the log that belongs to the method that was actually run.
        amptklib.log.error(fail_msg % build_log)
Esempio n. 2
0
                continue
            if taxCount > 5:  #species level, not have more than 1 seq per "species"
                if ID in seenTax:
                    continue
            out.write('>' + value + '\n' + key + '\n')
            seenTax.append(tax)
            seenID.append(ID)


# NOTE: this section uses Python 2 print statements.
# Write-mode handle on the null device.  Its use is not visible in this
# excerpt (presumably handed to subprocess calls to discard output -- TODO
# confirm); it is never closed in the code shown here.
FNULL = open(os.devnull, 'w')
# Process id, used further down to build a per-run temporary file name.
pid = os.getpid()
# The forward primer is used as given; the reverse primer is passed through
# revcomp_lib.RevComp (reverse complement) before searching.
ForPrimer = args.fwdprimer
RevPrimer = revcomp_lib.RevComp(args.revprimer)

# Report how many records the input holds and which primers are searched for.
print 'Loading ' + '{0:,}'.format(amptklib.countfasta(
    args.input)) + ' sequence records'
print 'Searching for forward primer: %s, and reverse primer: %s' % (ForPrimer,
                                                                    RevPrimer)
# NOTE(review): the message says "at least", but args.primer_mismatch is the
# value handed to findFwdPrimer/findRevPrimer as the mismatch setting, so
# "at most" looks like the intended wording -- confirm before changing.
print 'Requiring reverse primer match with at least %i mismatches' % args.primer_mismatch
# Temporary file that receives the output of the per-record loop that follows
# (loop through seqs, remove primer if found, and truncate to length).
truncated = 'bold2amptk_' + str(pid) + '.truncate.tmp'
with open(truncated, 'w') as output:
    for record in FastaIterator(open(args.input)):
        Seq = str(record.seq)
        StripSeq = ''
        ForCutPos = amptklib.findFwdPrimer(ForPrimer, Seq,
                                           args.primer_mismatch,
                                           amptklib.degenNucSimple)
        RevCutPos = amptklib.findRevPrimer(RevPrimer, Seq,
                                           args.primer_mismatch,
                                           amptklib.degenNucSimple)
Esempio n. 3
0
            uchime_db = os.path.abspath(args.uchime_ref)
        else:
            amptklib.log.error(
                "%s is not a valid file, skipping reference chimera filtering"
                % args.uchime_ref)
            uchime_out = fastaout
    #now run chimera filtering if all checks out
    if not os.path.isfile(uchime_out):
        amptklib.log.info("Chimera Filtering (VSEARCH) using %s DB" %
                          args.uchime_ref)
        cmd = [
            'vsearch', '--mindiv', '1.0', '--uchime_ref', fastaout, '--db',
            uchime_db, '--nonchimeras', uchime_out
        ]
        amptklib.runSubprocess(cmd, amptklib.log)
        total = amptklib.countfasta(uchime_out)
        uchime_chimeras = validSeqs - total
        amptklib.log.info('{0:,}'.format(total) + ' iSeqs passed, ' +
                          '{0:,}'.format(uchime_chimeras) +
                          ' ref chimeras removed')

    #now reformat OTUs and OTU table, dropping chimeric OTUs from table, sorting the output as well
    nonchimeras = amptklib.fasta2list(uchime_out)
    inferredSeqs = SeqIO.index(uchime_out, 'fasta')
    with open(iSeqs, 'w') as iSeqout:
        for x in natsorted(nonchimeras):
            SeqIO.write(inferredSeqs[x], iSeqout, 'fasta')
    if not args.debug:
        #clean up chimeras fasta
        amptklib.removefile(uchime_out)
        if os.path.isfile(fastaout):
Esempio n. 4
0
    utaxDB = DataBase.get(args.db)[1]
else:
    if not args.closed_ref_only:
        if args.utax_db:
            utaxDB = os.path.abspath(args.utax_db)
        else:
            amptklib.log.error("%s not pre-installed DB, must then also specify valid UTAX database via --utax_db" % args.db)
            sys.exit(1)

# Convert the input FASTQ to FASTA (kept for later mapping), then report the
# number of reads and the size of the input file.
amptklib.log.info("Loading FASTQ Records")
orig_fasta = os.path.join(tmp, args.out + '.orig.fa')
cmd = ['vsearch', '--fastq_filter', args.FASTQ, '--fastaout', orig_fasta, '--fastq_qmax', '55']
amptklib.runSubprocess(cmd, amptklib.log)
orig_total = amptklib.countfasta(orig_fasta)
size = amptklib.checkfastqsize(args.FASTQ)
readablesize = amptklib.convertSize(size)
amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize + ')')

# Expected Errors filtering step.
# args.maxee is coerced to text once and reused everywhere: the file names
# below are built by string concatenation, which would raise TypeError if the
# option were ever parsed as a number, while the vsearch argument was already
# being wrapped in str().
maxee_str = str(args.maxee)
filter_out = os.path.join(tmp, args.out + '.EE' + maxee_str + '.filter.fq')
filter_fasta = os.path.join(tmp, args.out + '.EE' + maxee_str + '.filter.fa')
amptklib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
cmd = ['vsearch', '--fastq_filter', args.FASTQ, '--fastq_maxee', maxee_str, '--fastqout', filter_out, '--fastaout', filter_fasta, '--fastq_qmax', '55']
amptklib.runSubprocess(cmd, amptklib.log)
qtrimtotal = amptklib.countfastq(filter_out)
amptklib.log.info('{0:,}'.format(qtrimtotal) + ' reads passed')

# Output path for the full-length dereplication step that follows.
derep_out = os.path.join(tmp, args.out + '.EE' + maxee_str + '.derep.fa')
amptklib.log.info("De-replication (remove duplicate reads)")
Esempio n. 5
0
        amptklib.log.error("If using the -b,--barcode option you must specify a fasta file of mock community via the --mc option")
        sys.exit(1)
    #get default mock community value
    if args.mc == "mock3":
        mock = os.path.join(parentdir, 'DB', 'amptk_mock3.fa')
    elif args.mc == "mock2":
        mock = os.path.join(parentdir, 'DB', 'amptk_mock2.fa')
    elif args.mc == "mock1":
        mock = os.path.join(parentdir, 'DB', 'amptk_mock1.fa')
    elif args.mc == "synmock":
        mock = os.path.join(parentdir, 'DB', 'amptk_synmock.fa')
    else:
        mock = os.path.abspath(args.mc)

    #open mock community fasta and count records
    mock_ref_count = amptklib.countfasta(mock)
    
    #load OTU lengths into dictionary
    SeqLength = amptklib.fastalen2dict(args.fasta)
    
    #map OTUs to mock community, this is fast enough, but running twice, first to get only top hit, then
    amptklib.log.info("Mapping OTUs to Mock Community (USEARCH)")
    cmd = [usearch, '-usearch_global', mock, '-strand', 'plus', '-id', '0.65', '-db', FastaCounts, '-userout', mock_out, '-userfields', 'query+target+id+ql+tl+alnlen+caln+mism+diffs', '-maxaccepts', '0', '-maxrejects', '0']
    amptklib.runSubprocess(cmd, amptklib.log)

    #generate dictionary for name change
    '''
    If args.calculate is set to all, that means the script is trying to measure a synthetic
    mock of some kind.  if that is the case, then chimeras are < 95% identical to mock members
    and variants would be hits in between, i.e 95% > but not the best hit.
    '''
Esempio n. 6
0
# Optionally build a custom database by concatenating an existing FASTA
# database with the user-supplied sequences in args.add2db.
if args.add2db:  #user wants to add sequences to the database on the fly, so the FASTA-format DB is used here
    amptklib.log.info("Adding %s to database" % args.add2db)
    custom_db = base + '.custom_database.fa'
    # NOTE(review): if neither args.db nor args.fasta_db is set, current_db is
    # never assigned and the open() below raises NameError -- confirm that an
    # earlier check (not visible here) rules this case out.
    if args.db:  #a named DB was given, so the FASTA file is the one in sintax_db
        current_db = sintax_db
    elif args.fasta_db:
        current_db = args.fasta_db
    # The existing database is copied first, then the additions are appended.
    # NOTE(review): the output is opened in binary mode while both inputs are
    # opened in text mode ('rU').  That combination works on Python 2 only; on
    # Python 3 writing text to a binary handle fails and the 'U' flag was
    # removed in 3.11.
    # NOTE(review): no newline is inserted between the two files, so this
    # presumably relies on current_db ending with a newline -- verify.
    with open(custom_db, 'wb') as outfile:
        with open(current_db, 'rU') as infile:
            shutil.copyfileobj(infile, outfile)
        with open(args.add2db, 'rU') as infile:
            shutil.copyfileobj(infile, outfile)

# Count the records in the input FASTA and report the number as OTUs.
amptklib.log.info("Loading FASTA Records")
total = amptklib.countfasta(args.fasta)
amptklib.log.info('{0:,}'.format(total) + ' OTUs')

# Output file names, one per taxonomy-assignment method, all derived from base.
# NOTE(review): utax_out and usearch_out resolve to the same path
# (base + '.usearch.txt'); confirm this is intentional and not a copy/paste
# slip for something like '.utax.txt'.
blast_out = base + '.blast.txt'
rdp_out = base + '.rdp.txt'
utax_out = base + '.usearch.txt'
usearch_out = base + '.usearch.txt'
sintax_out = base + '.sintax.txt'

if not args.taxonomy:
    #start with less common uses, i.e. Blast, rdp
    if args.method == 'blast':
        #check if command line blast installed
        if not amptklib.which('blastn'):
            amptklib.log.error("BLASTN not found in your PATH, exiting.")
Esempio n. 7
0
# Scratch folder for intermediate files, named after the output prefix.
tmp = args.out + '_tmp'
if not os.path.exists(tmp):
    os.makedirs(tmp)

# Convert the input FASTQ to FASTA (kept for later mapping), then report the
# number of reads and the size of the input file.
amptklib.log.info("Loading FASTQ Records")
orig_fasta = os.path.join(tmp, args.out + '.orig.fa')
cmd = [
    'vsearch', '--fastq_filter', args.FASTQ, '--fastaout', orig_fasta,
    '--fastq_qmax', '55'
]
amptklib.runSubprocess(cmd, amptklib.log)
orig_total = amptklib.countfasta(orig_fasta)
size = amptklib.checkfastqsize(args.FASTQ)
readablesize = amptklib.convertSize(size)
amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize + ')')

# Expected Errors filtering step.
# args.maxee is coerced to text once and reused everywhere: the file names
# below are built by string concatenation, which would raise TypeError if the
# option were ever parsed as a number, while the vsearch argument was already
# being wrapped in str().
maxee_str = str(args.maxee)
filter_out = os.path.join(tmp, args.out + '.EE' + maxee_str + '.filter.fq')
filter_fasta = os.path.join(tmp, args.out + '.EE' + maxee_str + '.filter.fa')
amptklib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
cmd = [
    'vsearch', '--fastq_filter', args.FASTQ, '--fastq_maxee', maxee_str,
    '--fastqout', filter_out, '--fastaout', filter_fasta,
    '--fastq_qmax', '55'
]
amptklib.runSubprocess(cmd, amptklib.log)
# Number of reads that survived the quality filter.
total = amptklib.countfastq(filter_out)
Esempio n. 8
0
        (FwdPrimer, RevPrimer))
else:
    amptklib.log.info("Working on file: %s" % args.fasta)

# Fall back to every core reported by the machine when args.cpus is unset
# (or otherwise falsy); an explicit value is used as given.
cpus = args.cpus if args.cpus else multiprocessing.cpu_count()

# Working directory for intermediate files, suffixed with this process's id.
pid = os.getpid()
folder = 'amptk_tmp_{0}'.format(pid)
if not os.path.exists(folder):
    os.makedirs(folder)

# Count the input records up front and report the total.
SeqCount = amptklib.countfasta(args.fasta)
amptklib.log.info('{0:,} records loaded'.format(SeqCount))
#if only 1 cpu just process data
if cpus == 1:
    stripPrimer(args.fasta)
else:
    amptklib.log.info("Using %i cpus to process data" % cpus)

    #now split it into chunks (as many cpus as are queried)
    amptklib.split_fasta(args.fasta, folder, cpus * 2)

    #get list of files
    file_list = []
    for file in os.listdir(folder):
        if file.endswith(".fasta"):
            file = os.path.join(folder, file)
Esempio n. 9
0
# Scratch folder for intermediate files, named after the output prefix.
tmp = args.out + '_tmp'
if not os.path.exists(tmp):
    os.makedirs(tmp)

# Convert the input FASTQ to FASTA (kept for later mapping), then report the
# number of reads and the size of the input file.
amptklib.log.info("Loading FASTQ Records")
orig_fasta = os.path.join(tmp, args.out + '.orig.fa')
cmd = [
    'vsearch', '--fastq_filter', args.FASTQ, '--fastaout', orig_fasta,
    '--fastq_qmax', '55'
]
amptklib.runSubprocess(cmd, amptklib.log)
orig_total = amptklib.countfasta(orig_fasta)
size = amptklib.checkfastqsize(args.FASTQ)
readablesize = amptklib.convertSize(size)
amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize + ')')

# Expected Errors filtering step.
# args.maxee is coerced to text once and reused everywhere: the file names
# below are built by string concatenation, which would raise TypeError if the
# option were ever parsed as a number, while the vsearch argument was already
# being wrapped in str().
maxee_str = str(args.maxee)
filter_out = os.path.join(tmp, args.out + '.EE' + maxee_str + '.filter.fq')
filter_fasta = os.path.join(tmp, args.out + '.EE' + maxee_str + '.filter.fa')
amptklib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
cmd = [
    'vsearch', '--fastq_filter', args.FASTQ, '--fastq_maxee', maxee_str,
    '--fastqout', filter_out, '--fastaout', filter_fasta,
    '--fastq_qmax', '55'
]
amptklib.runSubprocess(cmd, amptklib.log)
# Number of reads that survived the quality filter.
total = amptklib.countfastq(filter_out)
Esempio n. 10
0
# Version string that the detected DADA2 version is checked against.
# NOTE(review): the literal '******' looks like a masked/redacted value rather
# than a real version number -- restore the intended version before relying
# on this check.
dada2_pass = '******'
# Check the DADA2 version first: amptklib.gvc is given the detected version
# (Rversions[1]) and dada2_pass; on a falsy result an error is logged and the
# script exits with status 1.
if not amptklib.gvc(Rversions[1], dada2_pass):
    amptklib.log.error("R v%s; DADA2 v%s detected, need atleast v%s" % (Rversions[0], Rversions[1], dada2_pass))
    amptklib.log.error("See: http://benjjneb.github.io/dada2/dada-installation.html")
    sys.exit(1)
amptklib.log.info("R v%s; DADA2 v%s" % (Rversions[0], Rversions[1]))

# Count FASTQ records and remove 3' N's as dada2 can't handle them.
# The cleaned copy (no_ns) is the input for every step below.
amptklib.log.info("Loading FASTQ Records")
no_ns = args.out+'.cleaned_input.fq'
amptklib.fastq_strip_padding(args.fastq, no_ns)
# FASTA conversion of the cleaned reads, used here to count the records.
demuxtmp = args.out+'.original.fa'
cmd = ['vsearch', '--fastq_filter', os.path.abspath(no_ns),'--fastq_qmax', '55', '--fastaout', demuxtmp]
amptklib.runSubprocess(cmd, amptklib.log)
orig_total = amptklib.countfasta(demuxtmp)
size = amptklib.checkfastqsize(no_ns)
readablesize = amptklib.convertSize(size)
amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize + ')')

# Quality filter: vsearch is run with --fastq_maxee set to args.maxee and
# --fastq_maxns 0; the surviving reads are written to `derep`.
amptklib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
derep = args.out+'.qual-filtered.fq'
filtercmd = ['vsearch', '--fastq_filter', no_ns, '--fastq_maxee', str(args.maxee), '--fastqout', derep, '--fastq_qmax', '55', '--fastq_maxns', '0']
amptklib.runSubprocess(filtercmd, amptklib.log)
total = amptklib.countfastq(derep)
amptklib.log.info('{0:,}'.format(total) + ' reads passed')

# Split into individual per-sample files; filtfolder is the destination name.
amptklib.log.info("Splitting FASTQ file by Sample into individual files")
filtfolder = args.out+'_filtered'
Esempio n. 11
0
        "Error, you must specifiy either list of OTU names or a file containing OTU names, not both"
    )
    sys.exit(1)
# Collect the OTU names to drop, either from a file (one name per line) or
# from the list given on the command line.  When both are set the list wins,
# because it is assigned last.
if args.file:
    count = amptklib.line_count(args.file)
    with open(args.file, 'rU') as input:
        lines = [line.rstrip('\n') for line in input]
if args.list:
    lines = args.list
    count = len(args.list)
# A set gives constant-time membership tests in the filtering loop below.
dropList = set(lines)

# Report the size of the input and how many names were requested for removal.
total = amptklib.countfasta(args.input)
amptklib.log.info("Loading %i OTUs" % total)
amptklib.log.info("Dropping %i OTUs" % count)

# Copy every record whose id is not in the drop list to the cleaned FASTA.
newOTUs = args.out + '.cleaned.otus.fa'
with open(newOTUs, 'w') as otus, open(args.input, 'rU') as fasta:
    for rec in SeqIO.parse(fasta, 'fasta'):
        if rec.id not in dropList:
            SeqIO.write(rec, otus, 'fasta')

# File names for the rebuilt OTU table and the temporary reads file.
amptklib.log.info("Mapping Reads to OTUs and Building OTU table")
newTable = args.out + '.cleaned.otu_table.txt'
tmpReads = args.out + '.reads.tmp'