def _flagenabled(value):
    """Return True when an option value means "on".

    The workflow compares options such as ``concat`` and ``filtMLST`` against
    the string "False", so that string has to count as switched off.
    """
    return bool(value) and value != "False"


def startwf1(indir,resultdir,checkpoint=False,concat=False,mashmxdist=0.5,cpu=1,
             skip="",refdb="",hmmdb="",rnadb="",maxmlst=100,model="GTR",bs=0,
             kf=False,maxorg=50,filtMLST=True,fast=False,minmlst=10):
    """WORKFLOW 1: Build phylogeny from scratch.

    The pipeline is a chain of stages, each guarded by the current
    ``checkpoint`` label, so a job can be re-entered at the stage where it
    stopped. A stage that succeeds moves ``checkpoint`` to the next label and
    logs it as ``JOB_CHECKPOINT::<label>``.

    Args:
        indir: input location handed to ``parsegenomes.parseall``.
        resultdir: directory that receives all intermediate and final files.
        checkpoint: stage label to resume from; a falsy value starts at "w1-0".
        concat: build a concatenated supermatrix tree when enabled, otherwise
            a coalescent tree. Replaced by the value ``getmlstselection``
            returns when stage "w1-5" runs.
        mashmxdist: passed to ``mash.getdistances`` as ``maxdist``.
        cpu: passed through to the search, alignment and tree helpers.
        skip: passed to ``getorgs``; when it contains "skip3" (any case) the
            MLST genes are exported without waiting for ``usergenes.json``.
        refdb: reference database passed to ``copyseqsql.copydb``.
        hmmdb: protein HMM file; defaults to ``reducedcore.hmm`` next to this
            module.
        rnadb: RNA HMM file; defaults to ``barnap_bact_rRna.hmm`` next to this
            module.
        maxmlst: passed to ``getmlstselection``.
        model: passed to the tree building helpers.
        bs: passed to the tree building helpers; also selects the format
            given to ``ete3helper.rerootTree``.
        kf: keep intermediate files when truthy.
        maxorg: passed to ``getorgs`` as ``IGlimit``.
        filtMLST: run ``screenmlst`` before trimming when enabled.
        fast: passed to ``processmlst`` for the alignment step.
        minmlst: passed to ``getmlstselection``.

    Returns:
        False when a stage fails, "waiting" when the job needs a user
        selection before it can continue, None otherwise.
    """
    if not checkpoint:
        checkpoint = "w1-0"
    queryseqs = os.path.join(resultdir,"queryseqs")
    moddir = os.path.dirname(os.path.realpath(__file__))

    #Parse all inputs
    if checkpoint == "w1-0":
        log.info("JOB_STATUS::Parsing all genomes...")
        if not parsegenomes.parseall(indir,queryseqs):
            log.error("Problem parsing input genomes")
            return False
        checkpoint = "w1-1"
        log.info("JOB_CHECKPOINT::%s"%checkpoint)
        log.info("JOB_PROGRESS::5/100")

    #Run MASH distances
    mashresult = False
    if checkpoint == "w1-1":
        log.info("JOB_STATUS::Running MASH ANI estimation against reference sequences...")
        mashresult = mash.getdistances(queryseqs,resultdir,cpu=cpu,maxdist=mashmxdist)
        if not mashresult:
            log.error("MASH distance failed")
            return False
        checkpoint = "w1-2"
        log.info("JOB_CHECKPOINT::%s"%checkpoint)
        log.info("JOB_PROGRESS::10/100")

    #Get set of organisms to build seq DB
    selorgs = False
    if checkpoint == "w1-2":
        log.info("JOB_STATUS::Loading mash results...")
        if not mashresult:
            #Resumed at this stage: fall back to the results stored on disk
            reflist = os.path.join(resultdir,"reflist.json")
            if not os.path.exists(reflist):
                log.error("No Mash results to process")
                return False
            with open(reflist,"r") as fil:
                mashresult = json.load(fil)
            log.info("Loading mash results...")
        selorgs = getorgs(resultdir,mashresult,skip=skip,IGlimit=maxorg,minorgs=25)
        if selorgs:
            checkpoint = "w1-3"
            log.info("JOB_CHECKPOINT::%s"%checkpoint)
            log.info("JOB_PROGRESS::15/100")
        else:
            checkpoint = "w1-STEP2"
            log.info("JOB_CHECKPOINT::%s"%checkpoint)
    if checkpoint == "w1-STEP2":
        log.info("JOB_STATUS:: Waiting for selected organisms")
        return "waiting"

    #Copy reference sequence database and add query organisms
    orgdb = os.path.join(resultdir,"refquery.db")
    if not selorgs:
        #A user supplied list takes precedence over the automatic one
        for listname in ("userlist.json","autoOrglist.json"):
            listpath = os.path.join(resultdir,listname)
            if os.path.exists(listpath):
                with open(listpath,"r") as fil:
                    selorgs = json.load(fil)
                break
    if not selorgs and checkpoint in ("w1-3","w1-4","w1-5"):
        #These stages index into the selection; fail with a message instead
        #of a TypeError/AttributeError on False
        log.error("No organism selection found (userlist.json or autoOrglist.json)")
        return False

    if checkpoint == "w1-3":
        log.info("JOB_STATUS:: Collating selected genomes...")
        #Clear old db if exists
        if os.path.exists(orgdb):
            os.remove(orgdb)
        flist = list(selorgs["selspecies"])
        flist.extend(selorgs["seloutgroups"])
        flist = [x for x in flist if "query" not in x.lower()]
        log.info("refdb: %s"%refdb)
        if not copyseqsql.copydb(flist,refdb,orgdb):
            log.error("Failed at copydb")
            return False
        #get file list of query orgs and add to db
        seqlist = glob.glob(os.path.join(queryseqs,"*.fna"))
        if not makeseqsql.runlist(seqlist,orgdb,True,"",True):
            log.error("Failed adding query sequences to database")
            return False
        checkpoint = "w1-4"
        log.info("JOB_CHECKPOINT::%s"%checkpoint)

    #Run HMM searches on query organism
    if checkpoint == "w1-4":
        #orgdb is always a non-empty path, so test for the file itself
        if not os.path.exists(orgdb):
            log.error("No querydb rerun at checkpoint w1-3")
            return False
        #Write newly added seqences to file
        naseqs = os.path.join(resultdir,"addedseqs.fna")
        aaseqs = os.path.join(resultdir,"addedseqs.faa")
        seqsql2fa.writefasta(orgdb,aaseqs,False,"",True)
        seqsql2fa.writefasta(orgdb,naseqs,True,"",True)

        #Run HMM searches
        log.info("JOB_STATUS:: Searching for MLST genes in query sequences...")
        log.info("JOB_PROGRESS::25/100")
        if not hmmdb:
            hmmdb = os.path.join(moddir,"reducedcore.hmm")
        if not rnadb:
            rnadb = os.path.join(moddir,"barnap_bact_rRna.hmm")
        hmmsearch(aaseqs+".domhr",hmmdb,aaseqs,mcpu=cpu)
        hmmsearch(naseqs+".domhr",rnadb,naseqs,mcpu=cpu)

        #Add HMMresults to refdb and cleanup unused seqs
        log.info("Adding query HMM results to database")
        status = makehmmsql.run(aaseqs+".domhr",orgdb)
        if not status:
            #If no genes were found report error
            #NOTE(review): the message says processing stops but the workflow
            #carries on; confirm what makehmmsql.run returns before adding a
            #return here
            log.error("No MLST genes could be found. Stop processing")
        log.info("Adding query RNA results to database")
        makehmmsql.run(naseqs+".domhr",orgdb,rna=True)
        os.remove(aaseqs)
        os.remove(naseqs)
        checkpoint = "w1-5"
        log.info("JOB_CHECKPOINT::%s"%checkpoint)

    #Calculate mlst core genes and export
    mlstdir = os.path.join(resultdir,"mlstgenes")
    priorityfile = os.path.join(resultdir,"mlstpriority.json")
    genematjson = os.path.join(resultdir,"mlstmatrix.json")
    dndsfile = os.path.join(moddir,"dnds.json")
    if checkpoint == "w1-5":
        allquery = [os.path.splitext(os.path.split(x)[-1])[0].replace(" ","_")
                    for x in glob.glob(os.path.join(queryseqs,"*.fna"))]
        ignoreorgs = list(selorgs.get("seloutgroups",[]))
        ignoreorgs.extend(allquery)
        if os.path.exists(priorityfile) and os.path.exists(genematjson):
            #Both files are present from an earlier run: reuse the priorities
            with open(priorityfile,"r") as pfil:
                mlstpriority = json.load(pfil)
        else:
            _genemat,_orgs,mlstpriority = getgenematrix.getmat(
                orgdb,pct=0.5,pct2=1.0,bh=True,rna=True,savefil=genematjson,
                prifile=priorityfile,dndsfile=dndsfile,ignoreorgs=allquery)

        mlstselection, delorgs, concat = getmlstselection(
            resultdir,mlstpriority,maxmlst,ignoreorgs=ignoreorgs,concat=concat,minmlst=minmlst)

        if "skip3" in skip.lower() or os.path.exists(os.path.join(resultdir,"usergenes.json")):
            #Export selected genes to mlst folder
            log.info("JOB_STATUS:: Writing MLST genes...")
            getgenes.writeallgenes(orgdb,mlstselection,delorgs,outdir=mlstdir,
                                   outgroups=selorgs.get("seloutgroups",None),pct=0.5,rename=True)
            checkpoint = "w1-6"
            log.info("JOB_CHECKPOINT::%s"%checkpoint)
        else:
            checkpoint = "w1-STEP3"
            log.info("JOB_CHECKPOINT::%s"%checkpoint)
    if checkpoint == "w1-STEP3":
        log.info("JOB_STATUS:: Waiting for selected MLST genes")
        return "waiting"

    ## Align and trim all MLST genes
    aligndir = os.path.join(resultdir,"mlst_aligned")
    trimdir = os.path.join(resultdir,"mlst_trimmed")
    if checkpoint == "w1-6":
        log.info("JOB_STATUS:: Aligning MLST genes")
        log.info("JOB_PROGRESS::30/100")
        #align all
        processmlst(mlstdir,aligndir,cpu=cpu,fast=fast)
        checkpoint = "w1-6b"
        log.info("JOB_CHECKPOINT::%s"%checkpoint)

    if checkpoint == "w1-6b":
        if _flagenabled(filtMLST):
            #Extra screen of MLST genes to remove outliers based on starting tree distance
            log.info("JOB_STATUS:: Screening for inconsistent MLST genes")
            log.info("JOB_PROGRESS::55/100")
            excludemlst = screenmlst(mlstdir,aligndir,cpu=cpu)
            log.info("JOB_STATUS:: Excluded MLST genes: %s"%excludemlst)
        #trim all
        log.info("JOB_STATUS:: Trimming alignments")
        log.info("JOB_PROGRESS::65/100")
        processmlst(aligndir,trimdir,cpu=cpu,trim=True)
        checkpoint = "w1-7"
        log.info("JOB_CHECKPOINT::%s"%checkpoint)

    treedir = os.path.join(resultdir,"trees")
    finishedtree = ""
    #Build trees
    if checkpoint == "w1-7":
        log.info("JOB_PROGRESS::75/100")
        if _flagenabled(concat):
            log.info("JOB_STATUS:: Running concatenated supermatrix phylogeny")
            concatfasta = os.path.join(resultdir,"concatMLST.fasta")
            partfile = os.path.join(resultdir,"nucpartition.txt")
            concatphylogeny(resultdir, concatfasta, partfile,cpu=cpu,model=model,bs=bs)
        else:
            log.info("JOB_STATUS:: Running coalescent tree phylogeny")
            colphylogeny(resultdir,trimdir,cpu=cpu,model=model,bs=bs)
        checkpoint = "w1-F"
        log.info("JOB_CHECKPOINT::%s"%checkpoint)

    if checkpoint == "w1-F":
        finaltree = os.path.join(resultdir,"final.tree")
        #Same on/off reading of concat as in the tree building stage, so the
        #string "False" does not select the concatenated tree or its format
        useconcat = _flagenabled(concat)
        concattree = os.path.join(treedir,"concatTree.tree.treefile")
        summarytree = os.path.join(treedir,"summaryTree.tree")
        if useconcat and os.path.exists(concattree):
            finishedtree = concattree
        elif os.path.exists(summarytree):
            finishedtree = summarytree

        #Copy final tree to root dir
        if finishedtree:
            fmat = 2 if bs or not useconcat else 5
            log.debug("Saving final tree... %s"%fmat)
            #Fall back to a plain copy when rerooting reports failure
            if not ete3helper.rerootTree(finishedtree,finaltree,fmat=fmat):
                shutil.copy(finishedtree,finaltree)
        else:
            log.error("Could not find final tree. Tree building failed")

        #Job finished? do some cleanup
        if not kf:
            if os.path.exists(orgdb):
                os.remove(orgdb)
            #Directories may already be gone when the final stage is re-run
            for dirname in ("queryseqs","mlst_trimmed","mlstgenes"):
                olddir = os.path.join(resultdir,dirname)
                if os.path.isdir(olddir):
                    shutil.rmtree(olddir)
            for oldfil in glob.glob(os.path.join(treedir,"*.model")):
                os.remove(oldfil)
def startwf2(indir,resultdir,refdir="",checkpoint=False,reference="",model="GTR",bs=0,kf=False,maxmlst=100,
             cpu=1,mashmxdist=0.5,rnadb="",hmmdb="",fast=False):
    """WORKFLOW 2: Get all query genomes and identify reference tree to add sequences to.

    The pipeline is a chain of stages, each guarded by the current
    ``checkpoint`` label, so a job can be re-entered at the stage where it
    stopped. A stage that succeeds moves ``checkpoint`` to the next label and
    logs it as ``JOB_CHECKPOINT::<label>``.

    Args:
        indir: input location handed to ``parsegenomes.parseall``.
        resultdir: directory that receives all intermediate and final files.
        refdir: directory holding one sub-directory per reference.
        checkpoint: stage label to resume from; a falsy value starts at
            "w2-0", or at "w2-3" when ``reference`` is given.
        reference: name of the reference sub-directory; detected with
            ``getreference`` when stage "w2-2" runs.
        model, bs, kf, maxmlst: accepted but not used by this function.
        cpu: passed through to the search, alignment and tree helpers.
        mashmxdist: passed to ``mash.getdistances`` as ``maxdist``.
        rnadb: RNA HMM file; defaults to ``barnap_bact_rRna.hmm`` next to this
            module.
        hmmdb: protein HMM file; defaults to ``core.hmm`` of the reference if
            present, otherwise ``reducedcore.hmm`` next to this module.
        fast: passed to ``addallalign``.

    Returns:
        False when genome parsing or the MASH step fails, None otherwise.
    """
    if not checkpoint:
        checkpoint = "w2-0"
        #NOTE(review): the override is applied only when no checkpoint was
        #given, so an explicit resume point is respected - confirm intent
        if reference:
            checkpoint = "w2-3"
    queryseqs = os.path.join(resultdir,"queryseqs")
    moddir = os.path.dirname(os.path.realpath(__file__))

    #Parse all inputs
    if checkpoint == "w2-0":
        log.info("JOB_STATUS::Parsing all genomes...")
        if not parsegenomes.parseall(indir,queryseqs):
            log.error("Problem parsing input genomes")
            return False
        checkpoint = "w2-1"
        log.info("JOB_CHECKPOINT::%s"%checkpoint)
        log.info("JOB_PROGRESS::5/100")

    #Run MASH distances
    mashresult = False
    if checkpoint == "w2-1":
        log.info("JOB_STATUS::Running MASH ANI estimation against reference sequences...")
        mashresult = mash.getdistances(queryseqs,resultdir,cpu=cpu,maxdist=mashmxdist)
        if not mashresult:
            log.error("MASH distance failed")
            return False
        checkpoint = "w2-2"
        log.info("JOB_CHECKPOINT::%s"%checkpoint)
        log.info("JOB_PROGRESS::10/100")

    #Detect reference group
    if checkpoint == "w2-2":
        log.info("JOB_STATUS::Loading mash results...")
        if not mashresult:
            #Resumed at this stage: fall back to the results stored on disk
            reflist = os.path.join(resultdir,"reflist.json")
            if not os.path.exists(reflist):
                log.error("No Mash results to process")
                return False
            with open(reflist,"r") as fil:
                mashresult = json.load(fil)
            log.info("Loading mash results...")
        reference = getreference(mashresult,refdir)
        if reference:
            checkpoint = "w2-3"
            log.info("JOB_STATUS::Detected reference = %s"%reference)
            log.info("JOB_CHECKPOINT::%s"%checkpoint)
            log.info("JOB_PROGRESS::15/100")
        else:
            #Jump to the final stage, which reports the missing tree
            checkpoint = "w2-F"
            log.info("JOB_CHECKPOINT::%s"%checkpoint)
            log.error("No matching reference found for all query organisms")

    orgdb = os.path.join(resultdir,"allseqs.db")
    seqlist = glob.glob(os.path.join(queryseqs,"*.fna"))
    refpath = os.path.join(refdir,reference) if reference else ""

    #Add to db and run HMM searches. Use model in reference folder, or use global models
    if checkpoint == "w2-3":
        #get file list of query orgs and add to db
        makeseqsql.runlist(seqlist,orgdb,True,"",False)
        #Write newly added seqences to file
        naseqs = os.path.join(resultdir,"addedseqs.fna")
        aaseqs = os.path.join(resultdir,"addedseqs.faa")
        seqsql2fa.writefasta(orgdb,aaseqs,False,"",True)
        seqsql2fa.writefasta(orgdb,naseqs,True,"",True)

        #Run HMM searches
        log.info("JOB_STATUS:: Searching for MLST genes in query sequences...")
        log.info("JOB_PROGRESS::25/100")
        if not hmmdb:
            hmmdb = os.path.join(refpath,"core.hmm")
            if not os.path.exists(hmmdb):
                hmmdb = os.path.join(moddir,"reducedcore.hmm")
        if not rnadb:
            rnadb = os.path.join(moddir,"barnap_bact_rRna.hmm")
        hmmsearch(aaseqs+".domhr",hmmdb,aaseqs,mcpu=cpu)
        hmmsearch(naseqs+".domhr",rnadb,naseqs,mcpu=cpu)

        #Add HMMresults
        log.info("Adding query HMM results to database")
        status = makehmmsql.run(aaseqs+".domhr",orgdb)
        if not status:
            #If no genes were found report error
            #NOTE(review): the message says processing stops but the workflow
            #carries on; confirm what makehmmsql.run returns before adding a
            #return here
            log.error("No MLST genes could be found. Stop processing")
        log.info("Adding query RNA results to database")
        makehmmsql.run(naseqs+".domhr",orgdb,rna=True)
        os.remove(aaseqs)
        os.remove(naseqs)
        checkpoint = "w2-4"
        log.info("JOB_CHECKPOINT::%s"%checkpoint)

    mlstdir = os.path.realpath(os.path.join(resultdir,"mlstgenes"))
    if not os.path.exists(mlstdir):
        os.makedirs(mlstdir)
    aligndir = os.path.realpath(os.path.join(resultdir,"mlst_aligned"))
    if not os.path.exists(aligndir):
        os.makedirs(aligndir)
    trimdir = os.path.realpath(os.path.join(resultdir,"mlst_trimmed"))
    if not os.path.exists(trimdir):
        os.makedirs(trimdir)

    # extract single copy results from genelist
    if checkpoint == "w2-4":
        #Gene names are the base names of the .fna files in the reference folder
        genelist = [os.path.splitext(os.path.split(x)[1])[0]
                    for x in glob.glob(os.path.join(refpath,"*.fna"))]
        log.info("JOB_STATUS:: Writing MLST genes...")
        singles = [str(x) for x in getgenematrix.getmat(orgdb,rna=True,bh=True)[2][0] if x in genelist]
        getgenes.writeallgenes(orgdb,singles,[],outdir=mlstdir,outgroups=None,pct=0.5,writeaa=False,rename=True)
        log.info("JOB_STATUS:: Adding MLST genes to alignments...")
        addallalign([os.path.join(mlstdir,x+".fna") for x in singles], refpath, aligndir, cpu=cpu,fast=fast)
        log.info("JOB_STATUS:: Trimming all alignments...")
        processmlst(aligndir,trimdir,cpu=cpu,trim=True)
        checkpoint = "w2-5"
        log.info("JOB_CHECKPOINT::%s"%checkpoint)

    treedir = os.path.realpath(os.path.join(resultdir,"trees"))
    if not os.path.exists(treedir):
        os.makedirs(treedir)
    #Add to prebuilt trees
    if checkpoint == "w2-5":
        log.info("JOB_PROGRESS::55/100")
        log.info("JOB_STATUS:: Adding to reference gene trees...")
        inlist = [os.path.join(trimdir,x) for x in os.listdir(trimdir)]
        addalltrees(inlist,refpath,treedir,cpu=cpu)
        checkpoint = "w2-6"
        log.info("JOB_CHECKPOINT::%s" % checkpoint)

    if checkpoint == "w2-6":
        log.info("JOB_PROGRESS::85/100")
        log.info("JOB_STATUS:: Running coalescent tree phylogeny")
        #Combine all trees using ASTRAL
        flist = glob.glob(os.path.join(treedir,"RAxML_labelledTree.*.tree"))
        #reformat newick trees for astral
        #Raw string keeps the regex escapes intact; compiled once for all files
        cleaner = re.compile(r"\[I\d+?\]|\"|'|QUERY___")
        for pbt in flist:
            with open(pbt,"r") as ifil, open(pbt+".newick","w") as ofil:
                #Only the first line of each tree file is converted;
                #next() works on Python 2 and 3, file.next() only on Python 2
                firstline = next(ifil)
                ofil.write(cleaner.sub("",firstline))
        flist = glob.glob(os.path.join(treedir,"*.newick"))
        alltrees = catTrees(flist,os.path.join(treedir,"alltrees.tree"))
        coltree = os.path.join(treedir,"summaryTree.tree")
        runAstral(resultdir,alltrees,coltree)
        checkpoint = "w2-F"
        log.info("JOB_CHECKPOINT::%s" % checkpoint)

    finishedtree = ""
    if checkpoint == "w2-F":
        finaltree = os.path.join(resultdir,"final.tree")
        summarytree = os.path.join(treedir,"summaryTree.tree")
        if os.path.exists(summarytree):
            finishedtree = summarytree
        #Copy final tree to root dir
        if finishedtree:
            fmat = 2
            log.debug("Saving final tree... %s"%fmat)
            #Fall back to a plain copy when rerooting reports failure
            if not ete3helper.rerootTree(finishedtree,finaltree,fmat=fmat):
                shutil.copy(finishedtree,finaltree)
        else:
            log.error("Could not find final tree. Tree building failed")
def startUPLBwf(indir, resultdir, checkpoint=False, concat=False, mashmxdist=0.5, cpu=1, skip="", refdb="", hmmdb="", rnadb="", maxmlst=100, model="GTR", bs=0, kf=False, maxorg=50, filtMLST=True, fast=False, minmlst=10):
    """WORKFLOW 1: Build phylogeny from scratch

    Variant of the workflow 1 entry point that calls ``mash.getdistances``
    once per query item instead of once for the whole query directory. It
    reuses the "w1-*" checkpoint labels and the same signature as workflow 1.

    NOTE(review): the visible body looks unfinished - see the inline notes.
    Only the parsing stage and the MASH loop can run; most parameters
    (concat, refdb, hmmdb, rnadb, maxmlst, model, bs, kf, filtMLST, fast,
    minmlst) are not referenced in the visible code.

    Returns False when parsing or a MASH run fails; otherwise falls off the
    end of the visible code and returns None.
    """
    if not checkpoint:
        checkpoint = "w1-0"
    queryseqs = os.path.join(resultdir, "queryseqs")

    #Parse all inputs
    if checkpoint == "w1-0":
        log.info("JOB_STATUS::Parsing all genomes...")
        if parsegenomes.parseall(indir, queryseqs):
            checkpoint = "w1-1"
            log.info("JOB_CHECKPOINT::%s" % checkpoint)
            log.info("JOB_PROGRESS::5/100")
        else:
            log.error("Problem parsing input genomes")
            return False

    #Run MASH distances
    # NOTE(review): initialised to False, so mashresults.append() below raises
    # AttributeError on the first successful MASH run - presumably meant to
    # be a list
    mashresults = False
    if checkpoint == "w1-1":
        # NOTE(review): queryseqs is the path string built above, so this
        # iterates over its characters and len(queryseqs) is the string
        # length - presumably the files inside that directory were meant
        for idx, queryseq in enumerate(queryseqs):
            log.info(
                "JOB_STATUS::Running MASH ANI estimation against reference sequences...{}"
                .format(len(queryseqs)))
            mashresult = mash.getdistances([queryseq], resultdir, cpu=cpu, maxdist=mashmxdist)
            if mashresult:
                # The checkpoint is not advanced here (assignment disabled),
                # so the "w1-2" stage below is skipped in the same call
                # checkpoint = "w1-2"
                log.info("JOB_CHECKPOINT::%s" % checkpoint)
                log.info("JOB_PROGRESS::Seq {} of {}".format(
                    idx + 1, len(queryseqs)))
                mashresults.append(mashresult)
            else:
                log.error("MASH distance failed")
                return False

    #Get set of organisms to build seq DB
    selorgDict = False
    if checkpoint == "w1-2":
        log.info("JOB_STATUS::Loading mash results...")
        # NOTE(review): when entered directly at "w1-2" mashresults is still
        # False and cannot be iterated; each pass overwrites selorgs, so only
        # the last result would survive
        for mashresult in mashresults:
            selorgs = getorgs(resultdir, mashresult, skip=skip, IGlimit=maxorg, minorgs=25)
        # NOTE(review): zip() is given a single iterable here - looks like a
        # second argument is missing
        selorgDict = zip(queryseqs, )
        # NOTE(review): autosel and selection are not defined anywhere in
        # this function; unless they exist at module level this raises
        # NameError - TODO confirm
        with open(autosel, "w") as fil:
            json.dump(selection, fil)
        print(selorgs)