def startjob(indir, resultdir, skip="", checkpoint=False, workflow=1, refdb="", cpu=1,
             concat=False, model="GTR", bs=0, kf=False, maxmlst=100, maxorg=50,
             filtMLST=True, refdir="", fast=False, minmlst=10):
    """Prepare the result directory and start (or resume) a workflow.

    Steps performed here:
      1. Create ``<resultdir>/queryseqs`` if it does not exist.
      2. If no ``checkpoint`` was passed and ``<resultdir>/automlst.log``
         exists, read the last ``JOB_CHECKPOINT::`` and ``WORKFLOW::`` values
         from it. A workflow found in the log overrides the ``workflow``
         argument.
      3. (Re)initialise the module-level ``log`` on that same log file.
      4. Clamp the bootstrap count ``bs`` to the range 0..1000 (anything that
         cannot be converted to an int becomes 0).
      5. Dispatch to ``startwf1`` (workflow 1) or ``startwf2`` (workflow 2).

    All remaining keyword arguments are forwarded unchanged to the selected
    workflow function.

    Returns:
        Whatever the selected workflow function returns, or ``False`` when
        ``workflow`` is neither 1 nor 2.

    Raises:
        Any exception raised by the workflow function is logged and re-raised.
    """
    global log

    #Setup working directory
    querydir = os.path.join(os.path.realpath(resultdir), "queryseqs")  #query sequence folder
    if not os.path.exists(querydir):
        os.makedirs(querydir)

    #Read checkpoint from log if exists or use checkpoint parameter
    logpath = os.path.join(resultdir, "automlst.log")
    if not checkpoint and os.path.exists(logpath):
        with open(logpath, "r") as fil:
            for line in fil:
                x = line.strip().split("::")
                if len(x) < 2:
                    #No "::" separator means there is no value to read on this line
                    continue
                if "JOB_CHECKPOINT" in x[0]:
                    checkpoint = x[1]
                elif "WORKFLOW" in x[0]:
                    workflow = int(x[1])

    #Start log
    log = setlog.init(logpath, toconsole=True)

    #ensure bootstrap value is valid and no more than 1000
    try:
        bs = min(max(int(bs), 0), 1000)
    except (TypeError, ValueError):
        #TypeError covers None and other non-numeric objects, which int() rejects
        log.warning("Invalid bootsrap value found, setting to 0")
        bs = 0

    log.debug("START / RESUME JOB last checkpoint: %s" % checkpoint)
    log.info('JOB_PARAMS::{"resultdir":"%s","skip":"%s","workflow":%s,"concat":"%s","model":"%s"}'
             % (resultdir, skip, workflow, concat, model))

    if workflow == 1:
        log.info("WORKFLOW::1")
        try:
            return startwf1(indir, resultdir, checkpoint=checkpoint, skip=skip, refdb=refdb,
                            cpu=cpu, concat=concat, model=model, bs=bs, kf=kf,
                            maxmlst=maxmlst, maxorg=maxorg, filtMLST=filtMLST, fast=fast,
                            minmlst=minmlst)
        except Exception as e:
            log.error("Unexpected failure: %s" % e)
            raise
    elif workflow == 2:
        log.info("WORKFLOW::2")
        try:
            return startwf2(indir, resultdir, refdir=refdir, checkpoint=checkpoint, model=model,
                            bs=bs, kf=kf, maxmlst=maxmlst, cpu=cpu, fast=fast)
        except Exception as e:
            log.error("Unexpected failure: %s" % e)
            raise
    else:
        log.error("Improper workflow specified: %s" % workflow)
        return False
#!/usr/bin/env python
import argparse, subprocess, glob, os, tempfile, json, setlog, pickle, base64
from numpy import median, mean, floor, round
from multiprocessing import cpu_count

log = setlog.init(toconsole=True)


def mashdist(listfile, reffile, outputfile, cpu=1, maxdist=1.0):
    """Run ``mash dist`` of ``reffile`` against the files listed in ``listfile``.

    The command's standard output is written to ``outputfile + ".temp"``.

    Args:
        listfile: path passed to mash with ``-l``.
        reffile: reference passed to mash as the first positional argument.
        outputfile: output path prefix; ``".temp"`` is appended.
        cpu: when greater than 1, passed to mash as ``-p``.
        maxdist: passed to mash as ``-d``.

    Returns:
        True when mash exits with status 0, False when it exits non-zero.

    Raises:
        OSError: propagated unchanged if the ``mash`` executable cannot be
            started.
    """
    cmd = ["mash", "dist"]
    if cpu > 1:
        cmd += ["-p", str(cpu)]
    cmd += ["-d", str(maxdist), reffile, "-l", listfile]

    with open(outputfile + ".temp", "w") as ofil:
        log.info(
            "MASH_STATUS:: Running MASH ANI estimation on all input genomes"
        )
        try:
            # check_call (unlike call) raises CalledProcessError on a non-zero
            # exit status, which is what the handler below expects.
            subprocess.check_call(cmd, stdout=ofil)
        except subprocess.CalledProcessError as e:
            log.error("MASH_ERROR:: Could not process %s - %s" % (listfile, e))
            return False
        log.debug("MASH_STATUS:: Finished MASH ANI estimation")
        return True


# NOTE(review): the text of `writefilelist` stops partway through the statement
# below, so the function cannot be completed or reviewed from here. The visible
# fragment is preserved verbatim, commented out so that this module parses:
#
# def writefilelist(indir, outdir):
#     try:
#         flist = glob.glob(os.path.join(os.path.realpath(indir), "*.fa"))
#         if len(flist):
#             tf = tempfile.NamedTemporaryFile(prefix="queryflist_", suffix=".txt",
# Lab of Nadine Ziemert, Div. of Microbiology/Biotechnology
# Funding by the German Centre for Infection Research (DZIF)
#
# This file is part of ARTS
# ARTS is free software. you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version
#
# License: You should have received a copy of the GNU General Public License v3 with ARTS
# A copy of the GPLv3 can also be found at: <http://www.gnu.org/licenses/>.
#
import argparse, os, setlog, numpy as np, json, sqlite3 as sql

log = setlog.init(toconsole=True, level="info")


def getsingles(genemat, pct2=1.0):
    """Get rows with only single counts.

    Args:
        genemat: mapping of gene name -> row of per-column counts. All rows
            must have the same length (they are stacked into one matrix).
        pct2: fraction of columns in which a gene must have a count that is
            greater than 0 and less than 2 for the gene to be reported.

    Returns:
        A tuple ``(genes, counts)``: ``genes`` is the list of gene names that
        meet the threshold, in the mapping's iteration order; ``counts`` is a
        numpy array with, for every gene, the number of columns whose count
        lies strictly between 0 and 2.
    """
    # Materialise the dict views first: on Python 3 np.vstack() rejects a
    # dict_values object and np.array(dict_keys) yields a 0-d object array
    # that cannot be indexed. On Python 2 these were already lists.
    genenames = np.array(list(genemat.keys()))
    gmat = np.vstack(list(genemat.values()))

    #Store only values of 1 in matrix
    gmat = ((gmat < 2) * (gmat > 0)).astype(int)
    thresh = len(gmat[0]) * pct2
    counts = gmat.sum(axis=1)

    #get index of ubiquitus singles and send gene list
    inds = [i for i, count in enumerate(counts) if count >= thresh]
    return list(genenames[inds]), counts
def findsingles(infil, minnum=7, minorg=0.8, maxgenes=30, dnds="", outdir="", indir=".", log=None):
    """Pick single-copy genes shared by (almost) all organisms in a count matrix file.

    ``infil`` is a whitespace-separated table. Lines starting with ``#Gene``
    are headers whose columns from the sixth onward are organism names; other
    lines not starting with ``#`` carry a gene name in the first column and
    per-organism counts from the sixth column onward.

    Genes that are single copy (count == 1) in at least 95% of all organisms
    are candidates. Organisms are then dropped iteratively until at least
    ``minnum`` candidates are single copy in every remaining organism, or
    until fewer than ``minorg`` (fraction) of the organisms would remain.

    Args:
        infil: path of the count matrix file.
        minnum: minimum number of single-copy genes required.
        minorg: minimum fraction of organisms that must be kept.
        maxgenes: maximum number of genes returned / written.
        dnds: optional path to a JSON file mapping gene name -> value used to
            sort the genes ascending (genes without a value sort as 9).
        outdir: when set, ``<gene>.fna`` and ``<gene>.faa`` are written there
            for each returned gene, read from ``<indir>/<gene>.fna``.
        indir: directory holding the input ``<gene>.fna`` files.
        log: logger to use; a console logger is created when omitted.

    Returns:
        ``(removed_orgs, genes, mdnds)`` on success, where ``mdnds`` is the
        dN/dS value of the last returned gene or ``"N/A"`` when unavailable;
        ``(False, False, False)`` on failure.
    """
    if log is None:
        log = setlog.init(toconsole=True)

    orgs = []
    hks = []
    rows = []
    njvals = {}
    if dnds and os.path.isfile(dnds):
        with open(dnds, "r") as fil:
            njvals = json.load(fil)

    if os.path.isfile(infil):
        with open(infil, "r") as fil:
            for line in fil:
                if line.startswith("#Gene"):
                    orgs.extend(line.strip().split()[5:])
                elif not line.startswith("#"):
                    x = line.strip().split()
                    if not x:
                        #Blank line: nothing to parse
                        continue
                    hks.append(x[0])
                    rows.append([int(float(v)) for v in x[5:]])

    if not rows or not orgs:
        #Without genes or organisms np.vstack / the ratio checks below would raise
        log.error("Failed: no gene counts could be read from %s" % infil)
        return False, False, False

    gmat = np.vstack(rows)
    numorgs = len(orgs)
    filtinds = [
        i for i, x in enumerate(gmat)
        if float(list(x).count(1)) / numorgs >= 0.95
    ]
    hkinds = []
    orginds = list(range(numorgs))

    #Remove organisms with missing/duplicate genes until minimum single copy genes are found
    while len(hkinds) < minnum and float(len(orginds)) / numorgs >= minorg:
        hkinds = [
            i for i in filtinds
            if list(gmat[i][orginds]).count(1) == len(orginds)
        ]
        if len(hkinds) >= minnum:
            break
        found = set(hkinds)
        tophks = sorted([[x, i, list(x).count(1)]
                         for i, x in enumerate(gmat) if i not in found],
                        key=lambda row: row[2],
                        reverse=True)
        if not tophks:
            #Every gene is already selected; dropping organisms cannot add more
            break
        maxcount = tophks[0][2]
        tempmat = np.vstack([x[0] for x in tophks if x[2] == maxcount])
        orgcounts = sorted([[list(tempmat[:, j]).count(1), j]
                            for j in orginds],
                           key=lambda row: row[0])
        orginds = [x[1] for x in orgcounts[numorgs - maxcount:]]

    kept = set(orginds)
    selected = set(hkinds)
    remorg = [x for i, x in enumerate(orgs) if i not in kept]
    singhks = [x for i, x in enumerate(hks) if i in selected]

    #Default keeps the return value defined when no dN/dS table was loaded
    mdnds = "N/A"
    if njvals:
        singhks = sorted(singhks, key=lambda k: njvals.get(k, 9))
        top = singhks[:maxgenes]
        if top:
            mdnds = njvals.get(top[-1], "N/A")
        log.info("Lowest %s single copy genes found with dNdS < %s" %
                 (min(maxgenes, len(singhks)), mdnds))

    if len(hkinds) >= minnum:
        log.info("# of single copy genes found: %s\t# removed organisms: %s" %
                 (len(singhks), len(remorg)))
        log.info("Removed orgs:%s\tHKs:%s" % (remorg, singhks[:maxgenes]))
        if outdir:
            if not os.path.exists(outdir):
                os.makedirs(outdir)
            for hkgn in singhks[:maxgenes]:
                #Copy all single copy sequences to outdir after removing organisms
                seqrecs = SeqIO.parse(os.path.join(indir, hkgn + ".fna"), "fasta")
                with open(os.path.join(outdir, hkgn + ".fna"), "w") as fnafil, open(
                        os.path.join(outdir, hkgn + ".faa"), "w") as faafil:
                    for rec in seqrecs:
                        #Skip records whose id contains the name of a removed organism
                        if not any(x in rec.id for x in remorg):
                            fnafil.write(">%s\n%s\n" % (rec.id, rec.seq))
                            faafil.write(
                                ">%s\n%s\n" %
                                (rec.id, rec.seq.translate(to_stop=True)))
        return remorg, singhks[:maxgenes], mdnds
    else:
        log.error(
            "Failed: Only %s single copy genes found after removing orgs: %s" %
            (len(singhks), remorg))
        log.info("Try different set or allow more deletions")
        return False, False, False
def findsingles(db, minnum=7, minorg=0.8, maxgenes=30, dnds="", outdir="", keepgenes="", lf=""):
    """Pick single-copy core genes from ``db`` and optionally write them as FASTA.

    Sequences and the gene/organism count matrix come from ``get16S(db)`` and
    ``getcoregenes(db, seqdict=...)``. Genes that are single copy (count == 1)
    in at least 95% of organisms, plus every gene named in ``keepgenes``, are
    candidates. Starting from the organisms that carry all ``keepgenes`` once,
    organisms are dropped iteratively until at least ``minnum`` candidates
    (including all ``keepgenes``) are single copy in every remaining organism,
    or until fewer than ``minorg`` (fraction) of the organisms would remain.

    Args:
        db: passed through to ``get16S`` and ``getcoregenes``.
        minnum: minimum number of single-copy genes required.
        minorg: minimum fraction of organisms that must be kept.
        maxgenes: maximum number of genes returned / written.
        dnds: optional path to a JSON file mapping gene name -> value used to
            sort the genes ascending (genes without a value sort as 9).
        outdir: when set, ``<gene>.fna`` / ``<gene>.faa`` are written there.
        keepgenes: comma-separated gene names that must be in the result;
            they are moved to the front of the returned list.
        lf: log file path; when empty the logger writes to the console.

    Returns:
        ``(removed_orgs, genes)`` on success, ``(False, False)`` on failure.

    Side effects:
        Rebinds the module-level ``log``.
    """
    global log
    if lf:
        log = setlog.init(logfile=lf, level="info")
    else:
        log = setlog.init(toconsole=True, level="info")

    log.info("Getting 16S genes...")
    seqdict = get16S(db)
    log.info("Done. Getting Core genes...")
    # NOTE(review): gmat is indexed as gmat[row, col] below, so it is presumably
    # a 2-D numpy array of genes x organisms - confirm against getcoregenes.
    orgs, hks, gmat, seqdict = getcoregenes(db, seqdict=seqdict)

    keepgenes = keepgenes.split(",") if len(keepgenes) else []

    njvals = {}
    if dnds and os.path.isfile(dnds):
        with open(dnds, "r") as fil:
            njvals = json.load(fil)

    numorgs = len(orgs)
    filtinds = [
        i for i, x in enumerate(gmat)
        if float(list(x).count(1)) / numorgs >= 0.95
    ]
    filtinds.extend([i for i, x in enumerate(hks) if x in keepgenes])
    filtinds = list(set(filtinds))
    hkinds = []

    #Start with organisms that have all keepgenes
    orginds = [
        i for i in range(numorgs)
        if all(gmat[hks.index(x), i] == 1 for x in keepgenes)
    ]
    hksfound = []

    #Remove organisms with missing/duplicate genes until minimum single copy genes are found
    while float(len(orginds)) / numorgs >= minorg:
        hkinds = [
            i for i in filtinds
            if list(gmat[i][orginds]).count(1) == len(orginds)
        ]
        hksfound = [hks[i] for i in hkinds]
        if len(hkinds) >= minnum and all(x in hksfound for x in keepgenes):
            log.info("Found minimum genes")
            break
        found = set(hkinds)
        tophks = sorted([[x, i, list(x).count(1)]
                         for i, x in enumerate(gmat) if i not in found],
                        key=lambda row: row[2],
                        reverse=True)
        if not tophks:
            #Every gene is already selected; dropping organisms cannot add more
            break
        maxcount = tophks[0][2]
        tempmat = np.vstack([x[0] for x in tophks if x[2] == maxcount])
        orgcounts = sorted([[list(tempmat[:, j]).count(1), j]
                            for j in orginds],
                           key=lambda row: row[0])
        orginds = [x[1] for x in orgcounts[numorgs - maxcount:]]

    kept = set(orginds)
    selected = set(hkinds)
    remorg = [x for i, x in enumerate(orgs) if i not in kept]
    singhks = [x for i, x in enumerate(hks) if i in selected]

    #Prioritize by lowest dNdS values
    singhks = sorted(singhks, key=lambda k: njvals.get(k, 9))

    #Move keep genes at the top of list
    for x in keepgenes:
        if x in singhks:
            singhks.insert(0, singhks.pop(singhks.index(x)))

    if len(hkinds) >= minnum and all(x in hksfound for x in keepgenes):
        log.info("# of single copy genes found: %s\t# removed organisms: %s" %
                 (len(singhks), len(remorg)))
        log.info("Removed orgs:%s" % remorg)
        for i, x in enumerate(singhks):
            if i < maxgenes:
                log.info("Top Singles: %s\tdNdS=%s" % (x, njvals.get(x, "NA")))
            else:
                log.info("Other Singles: %s\tdNdS=%s" % (x, njvals.get(x, "NA")))

        #WRITE GENES TO FASTA
        if outdir:
            if not os.path.exists(outdir):
                os.makedirs(outdir)
            for hkgn in singhks[:maxgenes]:
                #Field 1 of the first hit goes to .fna, field 2 to .faa
                allnucrecs = [
                    ">%s\n%s" % (org, x[hkgn][0][1])
                    for org, x in seqdict.items()
                    if hkgn in x and x[hkgn][0][1] and org not in remorg
                ]
                allaarecs = [
                    ">%s\n%s" % (org, x[hkgn][0][2])
                    for org, x in seqdict.items()
                    if hkgn in x and x[hkgn][0][2] and org not in remorg
                ]
                with open(os.path.join(outdir, hkgn + ".fna"), "w") as fnafil, open(
                        os.path.join(outdir, hkgn + ".faa"), "w") as faafil:
                    if len(allnucrecs):
                        fnafil.write("\n".join(allnucrecs) + "\n")
                    #Guard on the amino-acid records themselves, not the nucleotide ones
                    if len(allaarecs):
                        faafil.write("\n".join(allaarecs) + "\n")
        return remorg, singhks[:maxgenes]
    else:
        log.error(
            "Failed: Only %s single copy genes found after removing orgs: %s\nKeepgenes found: %s"
            % (len(singhks), remorg, set(singhks) & set(keepgenes)))
        log.info("Try different set or allow more deletions")
        return False, False