Ejemplo n.º 1
0
def startwf1(indir,resultdir,checkpoint=False,concat=False,mashmxdist=0.5,cpu=1,
             skip="",refdb="",hmmdb="",rnadb="",maxmlst=100,model="GTR",bs=0,
             kf=False,maxorg=50,filtMLST=True,fast=False,minmlst=10):
    """WORKFLOW 1: Build phylogeny from scratch.

    The workflow is a chain of stages, each guarded by a ``checkpoint`` label
    ("w1-0" ... "w1-7", "w1-F"). A stage runs only when ``checkpoint`` equals
    its label and, on success, advances ``checkpoint`` to the next label, so
    a call can be resumed from any stage by passing that label in.

    Parameters:
        indir: input location handed to ``parsegenomes.parseall``.
        resultdir: directory holding all intermediate and final files.
        checkpoint: stage label to resume from; a falsy value starts at "w1-0".
        concat: truthy (and not the string "False") selects the concatenated
            supermatrix phylogeny, otherwise the coalescent one. The value is
            replaced by the one returned from ``getmlstselection`` when stage
            "w1-5" runs.
        mashmxdist: forwarded to ``mash.getdistances`` as ``maxdist``.
        cpu: forwarded to the helper calls that accept a cpu count.
        skip: forwarded to ``getorgs``; if it contains "skip3"
            (case-insensitive) the workflow does not wait for a gene selection.
        refdb: reference database handed to ``copyseqsql.copydb``.
        hmmdb, rnadb: HMM databases for the two ``hmmsearch`` calls; when
            empty, "reducedcore.hmm" / "barnap_bact_rRna.hmm" next to this
            module are used.
        maxmlst, minmlst: forwarded to ``getmlstselection``.
        model, bs: forwarded to the tree building functions; ``bs`` also
            influences the format passed to ``ete3helper.rerootTree``.
        kf: when falsy, intermediate files are deleted after the final stage.
        maxorg: forwarded to ``getorgs`` as ``IGlimit``.
        filtMLST: truthy (and not the string "False") enables ``screenmlst``.
        fast: forwarded to ``processmlst`` for the alignment step.

    Returns:
        False when a stage fails, "waiting" when the workflow pauses for a
        user selection ("w1-STEP2" / "w1-STEP3"), otherwise None.
    """
    if not checkpoint:
        checkpoint = "w1-0"
    queryseqs = os.path.join(resultdir,"queryseqs")

    #Parse all inputs
    if checkpoint == "w1-0":
        log.info("JOB_STATUS::Parsing all genomes...")
        if parsegenomes.parseall(indir,queryseqs):
            checkpoint = "w1-1"
            log.info("JOB_CHECKPOINT::%s"%checkpoint)
            log.info("JOB_PROGRESS::5/100")
        else:
            log.error("Problem parsing input genomes")
            return False

    #Run MASH distances
    mashresult = False
    if checkpoint == "w1-1":
        log.info("JOB_STATUS::Running MASH ANI estimation against reference sequences...")
        mashresult = mash.getdistances(queryseqs,resultdir,cpu=cpu,maxdist=mashmxdist)
        if mashresult:
            checkpoint = "w1-2"
            log.info("JOB_CHECKPOINT::%s"%checkpoint)
            log.info("JOB_PROGRESS::10/100")
        else:
            log.error("MASH distance failed")
            return False

    #Get set of organisms to build seq DB
    selorgs = False
    if checkpoint == "w1-2":
        log.info("JOB_STATUS::Loading mash results...")
        if mashresult:
            pass
        elif os.path.exists(os.path.join(resultdir,"reflist.json")):
            #Resumed run: reuse the result stored by an earlier call
            with open(os.path.join(resultdir,"reflist.json"),"r") as fil:
                mashresult = json.load(fil)
                log.info("Loading mash results...")
        else:
            log.error("No Mash results to process")
            return False

        selorgs = getorgs(resultdir,mashresult,skip=skip,IGlimit=maxorg,minorgs=25)
        if selorgs:
            checkpoint = "w1-3"
            log.info("JOB_CHECKPOINT::%s"%checkpoint)
            log.info("JOB_PROGRESS::15/100")
        else:
            checkpoint = "w1-STEP2"
            log.info("JOB_CHECKPOINT::%s"%checkpoint)

    if checkpoint == "w1-STEP2":
        log.info("JOB_STATUS:: Waiting for selected organisms")
        return "waiting"

    #Copy reference sequence database and add query organisms
    orgdb = os.path.join(resultdir,"refquery.db")
    #On a resumed run the selection comes from disk; userlist.json takes
    #precedence over autoOrglist.json
    if not selorgs and os.path.exists(os.path.join(resultdir,"userlist.json")):
        with open(os.path.join(resultdir,"userlist.json"),"r") as fil:
            selorgs = json.load(fil)
    elif not selorgs and os.path.exists(os.path.join(resultdir,"autoOrglist.json")):
        with open(os.path.join(resultdir,"autoOrglist.json"),"r") as fil:
            selorgs = json.load(fil)
    if checkpoint == "w1-3":
        log.info("JOB_STATUS:: Collating selected genomes...")
        if not selorgs:
            #Neither getorgs nor a stored selection file produced a selection
            log.error("No selected organisms to process")
            return False
        #Clear old db if exists
        if os.path.exists(orgdb):
            os.remove(orgdb)

        flist = list(selorgs["selspecies"])
        flist.extend(selorgs["seloutgroups"])
        flist = [x for x in flist if "query" not in x.lower()]
        log.info("refdb: %s"%refdb)
        if not copyseqsql.copydb(flist,refdb,orgdb):
            log.error("Failed at copydb")
            return False
        #get file list of query orgs and add to db
        seqlist = glob.glob(os.path.join(queryseqs,"*.fna"))
        allorgs = makeseqsql.runlist(seqlist,orgdb,True,"",True)
        if not allorgs:
            log.error("Failed adding query sequences to database")
            return False
        checkpoint = "w1-4"
        log.info("JOB_CHECKPOINT::%s"%checkpoint)

    #Run HMM searches on query organism
    if checkpoint == "w1-4":
        #orgdb is always a non-empty path, so test for the file itself
        if not os.path.exists(orgdb):
            log.error("No querydb rerun at checkpoint w1-3")
            return False
        #Write newly added seqences to file
        naseqs = os.path.join(resultdir,"addedseqs.fna")
        aaseqs = os.path.join(resultdir,"addedseqs.faa")
        seqsql2fa.writefasta(orgdb,aaseqs,False,"",True)
        seqsql2fa.writefasta(orgdb,naseqs,True,"",True)

        #Run HMM searches
        log.info("JOB_STATUS:: Searching for MLST genes in query sequences...")
        log.info("JOB_PROGRESS::25/100")
        if not hmmdb:
            hmmdb = os.path.join(os.path.dirname(os.path.realpath(__file__)),"reducedcore.hmm")
        if not rnadb:
            rnadb = os.path.join(os.path.dirname(os.path.realpath(__file__)),"barnap_bact_rRna.hmm")
        hmmsearch(aaseqs+".domhr",hmmdb,aaseqs,mcpu=cpu)
        hmmsearch(naseqs+".domhr",rnadb,naseqs,mcpu=cpu)

        #Add HMMresults to refdb and cleanup unused seqs
        log.info("Adding query HMM results to database")
        status = makehmmsql.run(aaseqs+".domhr",orgdb)
        if not status:
            #If no genes were found report error
            #NOTE(review): the message says processing stops but the workflow
            #continues with the RNA results - confirm whether that is intended
            log.error("No MLST genes could be found. Stop processing")
        log.info("Adding query RNA results to database")
        status = makehmmsql.run(naseqs+".domhr",orgdb,rna=True)
        os.remove(aaseqs)
        os.remove(naseqs)
        checkpoint = "w1-5"
        log.info("JOB_CHECKPOINT::%s"%checkpoint)

    #Calculate mlst core genes and export
    mlstdir = os.path.join(resultdir,"mlstgenes")
    mlstpriority = os.path.join(resultdir,"mlstpriority.json")
    genematjson = os.path.join(resultdir,"mlstmatrix.json")
    dndsfile = os.path.join(os.path.dirname(os.path.realpath(__file__)),"dnds.json")
    mlstselection = []
    delorgs = []
    if checkpoint == "w1-5":
        #Query organism names: file name without extension, spaces -> underscores
        allquery = [os.path.splitext(os.path.split(x)[-1])[0].replace(" ","_") for x in glob.glob(os.path.join(resultdir,"queryseqs","*.fna"))]
        ignoreorgs = list(selorgs.get("seloutgroups",[]))
        ignoreorgs.extend(allquery)

        #getmlstgenes.findsingles(orgdb,maxgenes=500,outdir=mlstdir)
        if os.path.exists(mlstpriority) and os.path.exists(genematjson):
            #Reuse the stored matrix/priority; mlstpriority switches from
            #being a path to being the loaded data here
            with open(genematjson,"r") as mfil, open(mlstpriority,"r") as pfil:
                mlstpriority = json.load(pfil)
                temp = json.load(mfil)
                genemat = temp["counts"]
                orgs = temp["orgs"]
                del temp
        else:
            genemat,orgs,mlstpriority = getgenematrix.getmat(orgdb,pct=0.5,pct2=1.0,bh=True,rna=True,savefil=genematjson,prifile=mlstpriority,dndsfile=dndsfile,ignoreorgs=allquery)
        mlstselection, delorgs, concat = getmlstselection(resultdir,mlstpriority,maxmlst,ignoreorgs=ignoreorgs,concat=concat,minmlst=minmlst)
        if "skip3" in skip.lower() or os.path.exists(os.path.join(resultdir,"usergenes.json")):
            #Export selected genes to mlst folder
            log.info("JOB_STATUS:: Writing MLST genes...")
            getgenes.writeallgenes(orgdb,mlstselection,delorgs,outdir=mlstdir,outgroups=selorgs.get("seloutgroups",None),pct=0.5,rename=True)
            checkpoint = "w1-6"
            log.info("JOB_CHECKPOINT::%s"%checkpoint)
        else:
            checkpoint = "w1-STEP3"
            log.info("JOB_CHECKPOINT::%s"%checkpoint)

    if checkpoint == "w1-STEP3":
        log.info("JOB_STATUS:: Waiting for selected MLST genes")
        return "waiting"

    ## Align and trim all MLST genes
    aligndir = os.path.join(resultdir,"mlst_aligned")
    trimdir = os.path.join(resultdir,"mlst_trimmed")

    if checkpoint == "w1-6":
        log.info("JOB_STATUS:: Aligning MLST genes")
        log.info("JOB_PROGRESS::30/100")
        #align all
        processmlst(mlstdir,aligndir,cpu=cpu,fast=fast)
        checkpoint = "w1-6b"
        log.info("JOB_CHECKPOINT::%s"%checkpoint)

    if checkpoint == "w1-6b":
        if filtMLST and filtMLST != "False":
            #Extra screen of MLST genes to remove outliers based on starting tree distance
            log.info("JOB_STATUS:: Screening for inconsistent MLST genes")
            log.info("JOB_PROGRESS::55/100")
            excludemlst = screenmlst(mlstdir,aligndir,cpu=cpu)
            log.info("JOB_STATUS:: Excluded MLST genes: %s"%excludemlst)

        #trim all
        log.info("JOB_STATUS:: Trimming alignments")
        log.info("JOB_PROGRESS::65/100")
        processmlst(aligndir,trimdir,cpu=cpu,trim=True)

        checkpoint = "w1-7"
        log.info("JOB_CHECKPOINT::%s"%checkpoint)

    treedir = os.path.join(resultdir,"trees")
    finishedtree = ""
    #concat may arrive as the string "False"; normalise once so that tree
    #building, tree lookup and output format all agree
    useconcat = bool(concat) and concat != "False"
    #Build trees
    if checkpoint == "w1-7":
        log.info("JOB_PROGRESS::75/100")
        if useconcat:
            log.info("JOB_STATUS:: Running concatenated supermatrix phylogeny")
            concatfasta = os.path.join(resultdir,"concatMLST.fasta")
            partfile = os.path.join(resultdir,"nucpartition.txt")
            concatphylogeny(resultdir, concatfasta, partfile,cpu=cpu,model=model,bs=bs)
            checkpoint = "w1-F"
            log.info("JOB_CHECKPOINT::%s"%checkpoint)
        else:
            log.info("JOB_STATUS:: Running coalescent tree phylogeny")
            colphylogeny(resultdir,trimdir,cpu=cpu,model=model,bs=bs)
            checkpoint = "w1-F"
            log.info("JOB_CHECKPOINT::%s"%checkpoint)

    if checkpoint == "w1-F":
        finaltree = os.path.join(resultdir,"final.tree")
        if os.path.exists(os.path.join(treedir,"concatTree.tree.treefile")) and useconcat:
            finishedtree = os.path.join(treedir,"concatTree.tree.treefile")
        elif os.path.exists(os.path.join(treedir,"summaryTree.tree")):
            finishedtree = os.path.join(treedir,"summaryTree.tree")
        #Copy final tree to root dir
        if finishedtree:
            fmat = 2 if bs or not useconcat else 5
            log.debug("Saving final tree... %s"%fmat)
            #Fall back to a plain copy when rerooting reports failure
            if not ete3helper.rerootTree(finishedtree,finaltree,fmat=fmat):
                shutil.copy(finishedtree,finaltree)
        else:
            log.error("Could not find final tree. Tree building failed")

        #Job finished? do some cleanup
        if not kf:
            if os.path.exists(orgdb):
                os.remove(orgdb)
            #A directory may be missing (e.g. rerun at w1-F after a previous
            #cleanup), so only remove what is actually there
            for tmpname in ("queryseqs","mlst_trimmed","mlstgenes"):
                tmpdir = os.path.join(resultdir,tmpname)
                if os.path.isdir(tmpdir):
                    shutil.rmtree(tmpdir)
            for oldfil in glob.glob(os.path.join(treedir,"*.model")):
                os.remove(oldfil)
Ejemplo n.º 2
0
def startwf2(indir,resultdir,refdir="",checkpoint=False,reference="",model="GTR",bs=0,kf=False,maxmlst=100,
             cpu=1,mashmxdist=0.5,rnadb="",hmmdb="",fast=False):
    """WORKFLOW 2: Get all query genomes and identify reference tree to add sequences to.

    The workflow is a chain of stages, each guarded by a ``checkpoint`` label
    ("w2-0" ... "w2-6", "w2-F"). A stage runs only when ``checkpoint`` equals
    its label and advances ``checkpoint`` when it completes.

    Parameters:
        indir: input location handed to ``parsegenomes.parseall``.
        resultdir: directory holding all intermediate and final files.
        refdir: directory containing one sub-directory per reference.
        checkpoint: stage label to resume from; a falsy value starts at "w2-0".
        reference: name of the reference sub-directory. When given, the
            workflow always starts at "w2-3", whatever ``checkpoint`` says.
            NOTE(review): this also overrides a later resume checkpoint -
            confirm that is intended.
        model, bs, kf, maxmlst: accepted but not used in this function body.
        cpu: forwarded to the helper calls that accept a cpu count.
        mashmxdist: forwarded to ``mash.getdistances`` as ``maxdist``.
        rnadb, hmmdb: HMM databases for the two ``hmmsearch`` calls. An empty
            ``hmmdb`` falls back to "core.hmm" in the reference directory if
            present, else "reducedcore.hmm" next to this module; an empty
            ``rnadb`` falls back to "barnap_bact_rRna.hmm" next to this module.
        fast: forwarded to ``addallalign``.

    Returns:
        False when parsing, MASH or loading of MASH results fails,
        otherwise None.
    """
    if not checkpoint:
        checkpoint = "w2-0"
    if reference:
        checkpoint = "w2-3"
    queryseqs = os.path.join(resultdir,"queryseqs")

    #Parse all inputs
    if checkpoint == "w2-0":
        log.info("JOB_STATUS::Parsing all genomes...")
        if parsegenomes.parseall(indir,queryseqs):
            checkpoint = "w2-1"
            log.info("JOB_CHECKPOINT::%s"%checkpoint)
            log.info("JOB_PROGRESS::5/100")
        else:
            log.error("Problem parsing input genomes")
            return False

    #Run MASH distances
    mashresult = False
    if checkpoint == "w2-1":
        log.info("JOB_STATUS::Running MASH ANI estimation against reference sequences...")
        mashresult = mash.getdistances(queryseqs,resultdir,cpu=cpu,maxdist=mashmxdist)
        if mashresult:
            checkpoint = "w2-2"
            log.info("JOB_CHECKPOINT::%s"%checkpoint)
            log.info("JOB_PROGRESS::10/100")
        else:
            log.error("MASH distance failed")
            return False

    #Detect reference group
    if checkpoint == "w2-2":
        log.info("JOB_STATUS::Loading mash results...")
        if mashresult:
            pass
        elif os.path.exists(os.path.join(resultdir,"reflist.json")):
            #Resumed run: reuse the result stored by an earlier call
            with open(os.path.join(resultdir,"reflist.json"),"r") as fil:
                mashresult = json.load(fil)
                log.info("Loading mash results...")
        else:
            log.error("No Mash results to process")
            return False

        reference = getreference(mashresult,refdir)
        if reference:
            checkpoint = "w2-3"
            log.info("JOB_STATUS::Detected reference = %s"%reference)
            log.info("JOB_CHECKPOINT::%s"%checkpoint)
            log.info("JOB_PROGRESS::15/100")
        else:
            #No reference: jump straight to the final stage, which then
            #reports the missing tree
            checkpoint = "w2-F"
            log.info("JOB_CHECKPOINT::%s"%checkpoint)
            log.error("No matching reference found for all query organisms")

    orgdb = os.path.join(resultdir,"allseqs.db")
    seqlist = glob.glob(os.path.join(queryseqs,"*.fna"))
    #Add to db and run HMM searches. Use model in reference folder, or use global models
    if checkpoint == "w2-3":
        #get file list of query orgs and add to db
        allorgs = makeseqsql.runlist(seqlist,orgdb,True,"",False)

        #Write newly added seqences to file
        naseqs = os.path.join(resultdir,"addedseqs.fna")
        aaseqs = os.path.join(resultdir,"addedseqs.faa")
        seqsql2fa.writefasta(orgdb,aaseqs,False,"",True)
        seqsql2fa.writefasta(orgdb,naseqs,True,"",True)

        #Run HMM searches
        log.info("JOB_STATUS:: Searching for MLST genes in query sequences...")
        log.info("JOB_PROGRESS::25/100")

        if not hmmdb:
            if os.path.exists(os.path.join(refdir,reference,"core.hmm")):
                hmmdb = os.path.join(refdir,reference,"core.hmm")
            else:
                hmmdb = os.path.join(os.path.dirname(os.path.realpath(__file__)),"reducedcore.hmm")
        if not rnadb:
            rnadb = os.path.join(os.path.dirname(os.path.realpath(__file__)),"barnap_bact_rRna.hmm")

        hmmsearch(aaseqs+".domhr",hmmdb,aaseqs,mcpu=cpu)
        hmmsearch(naseqs+".domhr",rnadb,naseqs,mcpu=cpu)

        #Add HMMresults
        log.info("Adding query HMM results to database")
        status = makehmmsql.run(aaseqs+".domhr",orgdb)
        if not status:
            #If no genes were found report error
            #NOTE(review): the message says processing stops but the workflow
            #continues with the RNA results - confirm whether that is intended
            log.error("No MLST genes could be found. Stop processing")
        log.info("Adding query RNA results to database")
        status = makehmmsql.run(naseqs+".domhr",orgdb,rna=True)
        os.remove(aaseqs)
        os.remove(naseqs)
        checkpoint = "w2-4"
        log.info("JOB_CHECKPOINT::%s"%checkpoint)

    #Working directories are created up front, whichever stage is resumed
    mlstdir = os.path.realpath(os.path.join(resultdir,"mlstgenes"))
    if not os.path.exists(mlstdir):
        os.makedirs(mlstdir)
    aligndir = os.path.realpath(os.path.join(resultdir,"mlst_aligned"))
    if not os.path.exists(aligndir):
        os.makedirs(aligndir)
    trimdir = os.path.realpath(os.path.join(resultdir,"mlst_trimmed"))
    if not os.path.exists(trimdir):
        os.makedirs(trimdir)

    # extract single copy results from genelist
    if checkpoint == "w2-4":
        #Gene names = base names of the *.fna files in the reference directory
        genelist = [os.path.splitext(os.path.split(x)[1])[0] for x in glob.glob(os.path.join(refdir,reference,"*.fna"))]
        log.info("JOB_STATUS:: Writing MLST genes...")
        singles = [str(x) for x in getgenematrix.getmat(orgdb,rna=True,bh=True)[2][0] if x in genelist]
        getgenes.writeallgenes(orgdb,singles,[],outdir=mlstdir,outgroups=None,pct=0.5,writeaa=False,rename=True)

        log.info("JOB_STATUS:: Adding MLST genes to alignments...")
        addallalign([os.path.join(mlstdir,x+".fna") for x in singles], os.path.join(refdir,reference), aligndir, cpu=cpu,fast=fast)

        log.info("JOB_STATUS:: Trimming all alignments...")
        processmlst(aligndir,trimdir,cpu=cpu,trim=True)
        checkpoint = "w2-5"
        log.info("JOB_CHECKPOINT::%s"%checkpoint)

    treedir = os.path.realpath(os.path.join(resultdir,"trees"))
    if not os.path.exists(treedir):
        os.makedirs(treedir)

    #Add to prebuilt trees
    if checkpoint == "w2-5":
        log.info("JOB_PROGRESS::55/100")
        log.info("JOB_STATUS:: Adding to reference gene trees...")
        inlist = [os.path.join(trimdir,x) for x in os.listdir(trimdir)]
        addalltrees(inlist,os.path.join(refdir,reference),treedir,cpu=cpu)
        checkpoint = "w2-6"
        log.info("JOB_CHECKPOINT::%s" % checkpoint)

    if checkpoint == "w2-6":
        log.info("JOB_PROGRESS::85/100")
        log.info("JOB_STATUS:: Running coalescent tree phylogeny")
        #Combine all trees using ASTRAL
        flist = glob.glob(os.path.join(treedir,"RAxML_labelledTree.*.tree"))
        #reformat newick trees for astral: drop "[I<n>]" labels, quote
        #characters and the "QUERY___" prefix from the first line of each tree
        labelpat = re.compile("\\[I\\d+?\\]|\"|'|QUERY___")
        for pbt in flist:
            with open(pbt,"r") as ifil, open(pbt+".newick","w") as ofil:
                #builtin next() works on Python 2 and 3; file.next() is Python 2 only
                x = next(ifil)
                ofil.write(labelpat.sub("",x))
        flist = glob.glob(os.path.join(treedir,"*.newick"))

        alltrees = catTrees(flist,os.path.join(treedir,"alltrees.tree"))
        coltree = os.path.join(treedir,"summaryTree.tree")
        runAstral(resultdir,alltrees,coltree)

        checkpoint = "w2-F"
        log.info("JOB_CHECKPOINT::%s" % checkpoint)

    finishedtree=""
    if checkpoint == "w2-F":
        finaltree = os.path.join(resultdir,"final.tree")
        if os.path.exists(os.path.join(treedir,"summaryTree.tree")):
            finishedtree = os.path.join(treedir,"summaryTree.tree")
        #Copy final tree to root dir
        if finishedtree:
            fmat = 2
            log.debug("Saving final tree... %s"%fmat)
            #Fall back to a plain copy when rerooting reports failure
            if not ete3helper.rerootTree(finishedtree,finaltree,fmat=fmat):
                shutil.copy(finishedtree,finaltree)
        else:
            log.error("Could not find final tree. Tree building failed")
Ejemplo n.º 3
0
def startUPLBwf(indir,
                resultdir,
                checkpoint=False,
                concat=False,
                mashmxdist=0.5,
                cpu=1,
                skip="",
                refdb="",
                hmmdb="",
                rnadb="",
                maxmlst=100,
                model="GTR",
                bs=0,
                kf=False,
                maxorg=50,
                filtMLST=True,
                fast=False,
                minmlst=10):
    """Parse query genomes and run MASH / organism selection per query.

    Uses the same "w1-*" checkpoint labels as workflow 1. Only the parsing
    ("w1-0"), MASH ("w1-1") and organism selection ("w1-2") stages are
    present; the parameters concat, refdb, hmmdb, rnadb, maxmlst, model, bs,
    kf, filtMLST, fast and minmlst are accepted but not used in this body.

    NOTE(review): this function looks unfinished - see the review notes
    inline; several statements cannot run as written.

    Returns False when parsing or a MASH call fails, otherwise None.
    """
    if not checkpoint:
        checkpoint = "w1-0"
    queryseqs = os.path.join(resultdir, "queryseqs")

    #Parse all inputs
    if checkpoint == "w1-0":
        log.info("JOB_STATUS::Parsing all genomes...")
        if parsegenomes.parseall(indir, queryseqs):
            checkpoint = "w1-1"
            log.info("JOB_CHECKPOINT::%s" % checkpoint)
            log.info("JOB_PROGRESS::5/100")
        else:
            log.error("Problem parsing input genomes")
            return False

    #Run MASH distances
    # NOTE(review): mashresults starts as False but .append() is called on it
    # below, which raises AttributeError; presumably meant to be a list.
    mashresults = False
    if checkpoint == "w1-1":
        # NOTE(review): queryseqs is the path string built above, so this
        # enumerates its characters and len(queryseqs) is the path length;
        # presumably a list of query files was intended - confirm.
        for idx, queryseq in enumerate(queryseqs):
            log.info(
                "JOB_STATUS::Running MASH ANI estimation against reference sequences...{}"
                .format(len(queryseqs)))
            mashresult = mash.getdistances([queryseq],
                                           resultdir,
                                           cpu=cpu,
                                           maxdist=mashmxdist)
            if mashresult:
                # NOTE(review): with the assignment below commented out the
                # checkpoint stays "w1-1", so the "w1-2" stage is not entered
                # in the same call.
                #        checkpoint = "w1-2"
                log.info("JOB_CHECKPOINT::%s" % checkpoint)
                log.info("JOB_PROGRESS::Seq {} of {}".format(
                    idx + 1, len(queryseqs)))
                mashresults.append(mashresult)
            else:
                log.error("MASH distance failed")
                return False

    #Get set of organisms to build seq DB
    selorgDict = False
    if checkpoint == "w1-2":
        log.info("JOB_STATUS::Loading mash results...")
        # NOTE(review): selorgs is overwritten on every iteration, so only the
        # selection for the last mash result is kept.
        for mashresult in mashresults:
            selorgs = getorgs(resultdir,
                              mashresult,
                              skip=skip,
                              IGlimit=maxorg,
                              minorgs=25)
        # NOTE(review): zip() is given a single argument here; a second
        # iterable appears to be missing.
        selorgDict = zip(queryseqs, )
    # NOTE(review): autosel and selection are not defined in this function;
    # unless they exist at module level this raises NameError - verify.
    with open(autosel, "w") as fil:
        json.dump(selection, fil)

    # NOTE(review): selorgs is only bound inside the "w1-2" loop above, so
    # this line fails with NameError when that loop did not run.
    print(selorgs)