Python getdistances Examples

Programming Language: Python

Namespace/Package Name: mash

Method/Function: getdistances

Examples at hotexamples.com: 3

Python getdistances - 3 examples found. These are the top-rated real-world Python examples of mash.getdistances extracted from open source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

def startwf1(indir,resultdir,checkpoint=False,concat=False,mashmxdist=0.5,cpu=1,
             skip="",refdb="",hmmdb="",rnadb="",maxmlst=100,model="GTR",bs=0,
             kf=False,maxorg=50,filtMLST=True,fast=False,minmlst=10):
    """WORKFLOW 1: Build phylogeny from scratch.

    The workflow is a chain of stages keyed by ``checkpoint``. Each stage runs
    only when ``checkpoint`` holds its label and, on success, advances
    ``checkpoint`` to the next label so the following ``if`` picks it up.
    Passing a label resumes a previous run from that stage.

    Stage labels, in order: "w1-0" (parse genomes), "w1-1" (MASH distances),
    "w1-2" (select organisms), "w1-STEP2" (wait for organism selection),
    "w1-3" (build sequence db), "w1-4" (HMM searches), "w1-5" (choose MLST
    genes), "w1-STEP3" (wait for gene selection), "w1-6" (align),
    "w1-6b" (screen and trim), "w1-7" (build trees), "w1-F" (finalize).

    Args:
        indir: Input location handed to ``parsegenomes.parseall``.
        resultdir: Working directory; all intermediate and final files are
            read from and written to it.
        checkpoint: Stage label to resume from; falsy means start at "w1-0".
        concat: Truthy (and not the string "False") selects the concatenated
            supermatrix phylogeny, otherwise the coalescent phylogeny is run.
            Overwritten by the value returned from ``getmlstselection`` when
            stage "w1-5" runs.
        mashmxdist: Passed to ``mash.getdistances`` as ``maxdist``.
        cpu: CPU count forwarded to the external tool wrappers.
        skip: Forwarded to ``getorgs``; if it contains "skip3" (case
            insensitive) the wait for user gene selection is skipped.
        refdb: Reference database handed to ``copyseqsql.copydb``.
        hmmdb: Protein HMM file; defaults to "reducedcore.hmm" next to this
            module.
        rnadb: RNA HMM file; defaults to "barnap_bact_rRna.hmm" next to this
            module.
        maxmlst: Forwarded to ``getmlstselection``.
        model: Forwarded to the phylogeny builders.
        bs: Forwarded to the phylogeny builders; also influences the tree
            format used when saving the final tree.
        kf: When falsy, intermediate files are removed at the end.
        maxorg: Passed to ``getorgs`` as ``IGlimit``.
        filtMLST: Truthy (and not the string "False") enables the MLST
            screening step.
        fast: Forwarded to ``processmlst`` for the alignment step.
        minmlst: Forwarded to ``getmlstselection``.

    Returns:
        False when parsing, MASH, result loading or a resume precondition
        fails; the string "waiting" when the job must pause for a user
        selection; otherwise None.
    """
    if not checkpoint:
        checkpoint = "w1-0"
    queryseqs = os.path.join(resultdir,"queryseqs")

    #Parse all inputs
    if checkpoint == "w1-0":
        log.info("JOB_STATUS::Parsing all genomes...")
        if parsegenomes.parseall(indir,queryseqs):
            checkpoint = "w1-1"
            log.info("JOB_CHECKPOINT::%s"%checkpoint)
            log.info("JOB_PROGRESS::5/100")
        else:
            log.error("Problem parsing input genomes")
            return False

    #Run MASH distances
    mashresult = False
    if checkpoint == "w1-1":
        log.info("JOB_STATUS::Running MASH ANI estimation against reference sequences...")
        mashresult = mash.getdistances(queryseqs,resultdir,cpu=cpu,maxdist=mashmxdist)
        if mashresult:
            checkpoint = "w1-2"
            log.info("JOB_CHECKPOINT::%s"%checkpoint)
            log.info("JOB_PROGRESS::10/100")
        else:
            log.error("MASH distance failed")
            return False

    #Get set of organisms to build seq DB
    selorgs = False
    if checkpoint == "w1-2":
        log.info("JOB_STATUS::Loading mash results...")
        if mashresult:
            pass
        elif os.path.exists(os.path.join(resultdir,"reflist.json")):
            #Resumed run: reuse the MASH result stored in the result directory
            with open(os.path.join(resultdir,"reflist.json"),"r") as fil:
                mashresult = json.load(fil)
                log.info("Loading mash results...")
        else:
            log.error("No Mash results to process")
            return False

        selorgs = getorgs(resultdir,mashresult,skip=skip,IGlimit=maxorg,minorgs=25)
        if selorgs:
            checkpoint = "w1-3"
            log.info("JOB_CHECKPOINT::%s"%checkpoint)
            log.info("JOB_PROGRESS::15/100")
        else:
            checkpoint = "w1-STEP2"
            log.info("JOB_CHECKPOINT::%s"%checkpoint)

    if checkpoint == "w1-STEP2":
        log.info("JOB_STATUS:: Waiting for selected organisms")
        return "waiting"

    #Copy reference sequence database and add query organisms
    orgdb = os.path.join(resultdir,"refquery.db")
    #On a resumed run the selection comes from disk; the user list is tried first
    if not selorgs and os.path.exists(os.path.join(resultdir,"userlist.json")):
        with open(os.path.join(resultdir,"userlist.json"),"r") as fil:
            selorgs = json.load(fil)
    elif not selorgs and os.path.exists(os.path.join(resultdir,"autoOrglist.json")):
        with open(os.path.join(resultdir,"autoOrglist.json"),"r") as fil:
            selorgs = json.load(fil)
    #Stages w1-3 and w1-5 index into selorgs (w1-4 falls through to w1-5);
    #fail with a logged error here instead of a TypeError/AttributeError later
    if not selorgs and checkpoint in ("w1-3","w1-4","w1-5"):
        log.error("No organism selection found, rerun at checkpoint w1-2")
        return False
    if checkpoint == "w1-3":
        log.info("JOB_STATUS:: Collating selected genomes...")
        #Clear old db if exists
        if os.path.exists(orgdb):
            os.remove(orgdb)

        flist = list(selorgs["selspecies"])
        flist.extend(selorgs["seloutgroups"])
        #Entries containing "query" are dropped here; query genomes are added from queryseqs below
        flist = [x for x in flist if "query" not in x.lower()]
        log.info("refdb: %s"%refdb)
        if copyseqsql.copydb(flist,refdb,orgdb):
            #get file list of query orgs and add to db
            seqlist = glob.glob(os.path.join(queryseqs,"*.fna"))
            allorgs = makeseqsql.runlist(seqlist,orgdb,True,"",True)
            if allorgs:
                checkpoint = "w1-4"
                log.info("JOB_CHECKPOINT::%s"%checkpoint)
        else:
            log.error("Failed at copydb")

    #Run HMM searches on query organism
    if checkpoint == "w1-4":
        #orgdb is always a non-empty path string, so test for the file itself
        if not os.path.exists(orgdb):
            log.error("No querydb rerun at checkpoint w1-3")
            return False
        #Write newly added sequences to file
        naseqs = os.path.join(resultdir,"addedseqs.fna")
        aaseqs = os.path.join(resultdir,"addedseqs.faa")
        seqsql2fa.writefasta(orgdb,aaseqs,False,"",True)
        seqsql2fa.writefasta(orgdb,naseqs,True,"",True)

        #Run HMM searches
        log.info("JOB_STATUS:: Searching for MLST genes in query sequences...")
        log.info("JOB_PROGRESS::25/100")
        if not hmmdb:
            hmmdb = os.path.join(os.path.dirname(os.path.realpath(__file__)),"reducedcore.hmm")
        if not rnadb:
            rnadb = os.path.join(os.path.dirname(os.path.realpath(__file__)),"barnap_bact_rRna.hmm")
        hmmsearch(aaseqs+".domhr",hmmdb,aaseqs,mcpu=cpu)
        hmmsearch(naseqs+".domhr",rnadb,naseqs,mcpu=cpu)

        #Add HMMresults to refdb and cleanup unused seqs
        log.info("Adding query HMM results to database")
        status = makehmmsql.run(aaseqs+".domhr",orgdb)
        if not status:
            #If no genes were found report error (processing still continues)
            log.error("No MLST genes could be found. Stop processing")
        log.info("Adding query RNA results to database")
        status = makehmmsql.run(naseqs+".domhr",orgdb,rna=True)
        os.remove(aaseqs)
        os.remove(naseqs)
        checkpoint = "w1-5"
        log.info("JOB_CHECKPOINT::%s"%checkpoint)

    #Calculate mlst core genes and export
    mlstdir = os.path.join(resultdir,"mlstgenes")
    mlstpriority = os.path.join(resultdir,"mlstpriority.json")
    genematjson = os.path.join(resultdir,"mlstmatrix.json")
    dndsfile = os.path.join(os.path.dirname(os.path.realpath(__file__)),"dnds.json")
    mlstselection = []
    delorgs = []
    if checkpoint == "w1-5":
        #Query organism names: file stem of every parsed genome, spaces replaced by underscores
        allquery = [os.path.splitext(os.path.split(x)[-1])[0].replace(" ","_") for x in glob.glob(os.path.join(resultdir,"queryseqs","*.fna"))]
        ignoreorgs = list(selorgs.get("seloutgroups",[]))
        ignoreorgs.extend(allquery)

        #getmlstgenes.findsingles(orgdb,maxgenes=500,outdir=mlstdir)
        if os.path.exists(mlstpriority) and os.path.exists(genematjson):
            #Reuse cached matrix/priority files; mlstpriority changes from a path to the loaded data
            with open(genematjson,"r") as mfil, open(mlstpriority,"r") as pfil:
                mlstpriority = json.load(pfil)
                temp = json.load(mfil)
                genemat = temp["counts"]
                orgs = temp["orgs"]
                del temp
        else:
            genemat,orgs,mlstpriority = getgenematrix.getmat(orgdb,pct=0.5,pct2=1.0,bh=True,rna=True,savefil=genematjson,prifile=mlstpriority,dndsfile=dndsfile,ignoreorgs=allquery)
        mlstselection, delorgs, concat = getmlstselection(resultdir,mlstpriority,maxmlst,ignoreorgs=ignoreorgs,concat=concat,minmlst=minmlst)
        if "skip3" in skip.lower() or os.path.exists(os.path.join(resultdir,"usergenes.json")):
            #Export selected genes to mlst folder
            log.info("JOB_STATUS:: Writing MLST genes...")
            getgenes.writeallgenes(orgdb,mlstselection,delorgs,outdir=mlstdir,outgroups=selorgs.get("seloutgroups",None),pct=0.5,rename=True)
            checkpoint = "w1-6"
            log.info("JOB_CHECKPOINT::%s"%checkpoint)
        else:
            checkpoint = "w1-STEP3"
            log.info("JOB_CHECKPOINT::%s"%checkpoint)

    if checkpoint == "w1-STEP3":
        log.info("JOB_STATUS:: Waiting for selected MLST genes")
        return "waiting"

    ## Align and trim all MLST genes
    aligndir = os.path.join(resultdir,"mlst_aligned")
    trimdir = os.path.join(resultdir,"mlst_trimmed")

    if checkpoint == "w1-6":
        log.info("JOB_STATUS:: Aligning MLST genes")
        log.info("JOB_PROGRESS::30/100")
        #align all
        processmlst(mlstdir,aligndir,cpu=cpu,fast=fast)
        checkpoint = "w1-6b"
        log.info("JOB_CHECKPOINT::%s"%checkpoint)

    if checkpoint == "w1-6b":
        if filtMLST and filtMLST != "False":
            #Extra screen of MLST genes to remove outliers based on starting tree distance
            log.info("JOB_STATUS:: Screening for inconsistent MLST genes")
            log.info("JOB_PROGRESS::55/100")
            excludemlst = screenmlst(mlstdir,aligndir,cpu=cpu)
            log.info("JOB_STATUS:: Excluded MLST genes: %s"%excludemlst)

        #trim all
        log.info("JOB_STATUS:: Trimming alignments")
        log.info("JOB_PROGRESS::65/100")
        processmlst(aligndir,trimdir,cpu=cpu,trim=True)

        checkpoint = "w1-7"
        log.info("JOB_CHECKPOINT::%s"%checkpoint)

    treedir = os.path.join(resultdir,"trees")
    finishedtree = ""
    #concat may arrive as the string "False"; resolve it once so that tree
    #building and final tree selection agree on which phylogeny was run
    useconcat = bool(concat) and concat != "False"
    #Build trees
    if checkpoint == "w1-7":
        log.info("JOB_PROGRESS::75/100")
        if useconcat:
            log.info("JOB_STATUS:: Running concatenated supermatrix phylogeny")
            concatfasta = os.path.join(resultdir,"concatMLST.fasta")
            partfile = os.path.join(resultdir,"nucpartition.txt")
            concatphylogeny(resultdir, concatfasta, partfile,cpu=cpu,model=model,bs=bs)
            checkpoint = "w1-F"
            log.info("JOB_CHECKPOINT::%s"%checkpoint)
        else:
            log.info("JOB_STATUS:: Running coalescent tree phylogeny")
            colphylogeny(resultdir,trimdir,cpu=cpu,model=model,bs=bs)
            checkpoint = "w1-F"
            log.info("JOB_CHECKPOINT::%s"%checkpoint)

    if checkpoint == "w1-F":
        finaltree = os.path.join(resultdir,"final.tree")
        if useconcat and os.path.exists(os.path.join(treedir,"concatTree.tree.treefile")):
            finishedtree = os.path.join(treedir,"concatTree.tree.treefile")
        elif os.path.exists(os.path.join(treedir,"summaryTree.tree")):
            finishedtree = os.path.join(treedir,"summaryTree.tree")
        #Copy final tree to root dir
        if finishedtree:
            #Format 5 is used only for a concatenated tree built with a falsy bs
            fmat = 2 if bs or not useconcat else 5
            log.debug("Saving final tree... %s"%fmat)
            #Fall back to a plain copy when rerooting reports failure
            if not ete3helper.rerootTree(finishedtree,finaltree,fmat=fmat):
                shutil.copy(finishedtree,finaltree)
        else:
            log.error("Could not find final tree. Tree building failed")

        #Job finished? do some cleanup
        if not kf:
            if os.path.exists(orgdb):
                os.remove(orgdb)
            shutil.rmtree(os.path.join(resultdir,"queryseqs"))
            shutil.rmtree(os.path.join(resultdir,"mlst_trimmed"))
            shutil.rmtree(os.path.join(resultdir,"mlstgenes"))
            for oldfil in glob.glob(os.path.join(treedir,"*.model")):
                os.remove(oldfil)

Example #2

Show file

def startwf2(indir,resultdir,refdir="",checkpoint=False,reference="",model="GTR",bs=0,kf=False,maxmlst=100,
             cpu=1,mashmxdist=0.5,rnadb="",hmmdb="",fast=False):
    """WORKFLOW 2: Get all query genomes and identify reference tree to add sequences to.

    The workflow is a chain of stages keyed by ``checkpoint``. Each stage runs
    only when ``checkpoint`` holds its label and advances it on success, so a
    previous run can be resumed by passing the label it stopped at.

    Stage labels, in order: "w2-0" (parse genomes), "w2-1" (MASH distances),
    "w2-2" (detect reference), "w2-3" (build db and run HMM searches),
    "w2-4" (write, align and trim MLST genes), "w2-5" (add to reference gene
    trees), "w2-6" (combine gene trees), "w2-F" (save final tree).

    Args:
        indir: Input location handed to ``parsegenomes.parseall``.
        resultdir: Working directory for all intermediate and final files.
        refdir: Directory holding one sub-directory per reference.
        checkpoint: Stage label to resume from; falsy means start at "w2-0".
        reference: Name of the reference sub-directory. When truthy the
            checkpoint is forced to "w2-3", whatever ``checkpoint`` was.
        model, bs, kf, maxmlst: Accepted but not used in this function.
        cpu: CPU count forwarded to the external tool wrappers.
        mashmxdist: Passed to ``mash.getdistances`` as ``maxdist``.
        rnadb: RNA HMM file; defaults to "barnap_bact_rRna.hmm" next to this
            module.
        hmmdb: Protein HMM file; defaults to "core.hmm" in the reference
            directory if present, else "reducedcore.hmm" next to this module.
        fast: Forwarded to ``addallalign``.

    Returns:
        False when parsing, MASH or loading of MASH results fails; otherwise
        None.
    """
    if not checkpoint:
        checkpoint = "w2-0"
    if reference:
        checkpoint = "w2-3"
    queryseqs = os.path.join(resultdir,"queryseqs")

    #Parse all inputs
    if checkpoint == "w2-0":
        log.info("JOB_STATUS::Parsing all genomes...")
        if parsegenomes.parseall(indir,queryseqs):
            checkpoint = "w2-1"
            log.info("JOB_CHECKPOINT::%s"%checkpoint)
            log.info("JOB_PROGRESS::5/100")
        else:
            log.error("Problem parsing input genomes")
            return False

    #Run MASH distances
    mashresult = False
    if checkpoint == "w2-1":
        log.info("JOB_STATUS::Running MASH ANI estimation against reference sequences...")
        mashresult = mash.getdistances(queryseqs,resultdir,cpu=cpu,maxdist=mashmxdist)
        if mashresult:
            checkpoint = "w2-2"
            log.info("JOB_CHECKPOINT::%s"%checkpoint)
            log.info("JOB_PROGRESS::10/100")
        else:
            log.error("MASH distance failed")
            return False

    #Detect reference group
    if checkpoint == "w2-2":
        log.info("JOB_STATUS::Loading mash results...")
        if mashresult:
            pass
        elif os.path.exists(os.path.join(resultdir,"reflist.json")):
            #Resumed run: reuse the MASH result stored in the result directory
            with open(os.path.join(resultdir,"reflist.json"),"r") as fil:
                mashresult = json.load(fil)
                log.info("Loading mash results...")
        else:
            log.error("No Mash results to process")
            return False

        reference = getreference(mashresult,refdir)
        if reference:
            checkpoint = "w2-3"
            log.info("JOB_STATUS::Detected reference = %s"%reference)
            log.info("JOB_CHECKPOINT::%s"%checkpoint)
            log.info("JOB_PROGRESS::15/100")
        else:
            #Without a reference every remaining stage is skipped; the final stage then logs the missing tree
            checkpoint = "w2-F"
            log.info("JOB_CHECKPOINT::%s"%checkpoint)
            log.error("No matching reference found for all query organisms")

    orgdb = os.path.join(resultdir,"allseqs.db")
    seqlist = glob.glob(os.path.join(queryseqs,"*.fna"))
    #Add to db and run HMM searches. Use model in reference folder, or use global models
    if checkpoint == "w2-3":
        #get file list of query orgs and add to db
        allorgs = makeseqsql.runlist(seqlist,orgdb,True,"",False)

        #Write newly added sequences to file
        naseqs = os.path.join(resultdir,"addedseqs.fna")
        aaseqs = os.path.join(resultdir,"addedseqs.faa")
        seqsql2fa.writefasta(orgdb,aaseqs,False,"",True)
        seqsql2fa.writefasta(orgdb,naseqs,True,"",True)

        #Run HMM searches
        log.info("JOB_STATUS:: Searching for MLST genes in query sequences...")
        log.info("JOB_PROGRESS::25/100")

        if not hmmdb:
            if os.path.exists(os.path.join(refdir,reference,"core.hmm")):
                hmmdb = os.path.join(refdir,reference,"core.hmm")
            else:
                hmmdb = os.path.join(os.path.dirname(os.path.realpath(__file__)),"reducedcore.hmm")
        if not rnadb:
            rnadb = os.path.join(os.path.dirname(os.path.realpath(__file__)),"barnap_bact_rRna.hmm")

        hmmsearch(aaseqs+".domhr",hmmdb,aaseqs,mcpu=cpu)
        hmmsearch(naseqs+".domhr",rnadb,naseqs,mcpu=cpu)

        #Add HMMresults
        log.info("Adding query HMM results to database")
        status = makehmmsql.run(aaseqs+".domhr",orgdb)
        if not status:
            #If no genes were found report error (processing still continues)
            log.error("No MLST genes could be found. Stop processing")
        log.info("Adding query RNA results to database")
        status = makehmmsql.run(naseqs+".domhr",orgdb,rna=True)
        os.remove(aaseqs)
        os.remove(naseqs)
        checkpoint = "w2-4"
        log.info("JOB_CHECKPOINT::%s"%checkpoint)

    #Working directories are created up front, whichever stage the run resumes at
    mlstdir = os.path.realpath(os.path.join(resultdir,"mlstgenes"))
    if not os.path.exists(mlstdir):
        os.makedirs(mlstdir)
    aligndir = os.path.realpath(os.path.join(resultdir,"mlst_aligned"))
    if not os.path.exists(aligndir):
        os.makedirs(aligndir)
    trimdir = os.path.realpath(os.path.join(resultdir,"mlst_trimmed"))
    if not os.path.exists(trimdir):
        os.makedirs(trimdir)

    # extract single copy results from genelist
    if checkpoint == "w2-4":
        #Gene names are the file stems of the *.fna files in the reference directory
        genelist = [os.path.splitext(os.path.split(x)[1])[0] for x in glob.glob(os.path.join(refdir,reference,"*.fna"))]
        log.info("JOB_STATUS:: Writing MLST genes...")
        singles = [str(x) for x in getgenematrix.getmat(orgdb,rna=True,bh=True)[2][0] if x in genelist]
        getgenes.writeallgenes(orgdb,singles,[],outdir=mlstdir,outgroups=None,pct=0.5,writeaa=False,rename=True)

        log.info("JOB_STATUS:: Adding MLST genes to alignments...")
        addallalign([os.path.join(mlstdir,x+".fna") for x in singles], os.path.join(refdir,reference), aligndir, cpu=cpu,fast=fast)

        log.info("JOB_STATUS:: Trimming all alignments...")
        processmlst(aligndir,trimdir,cpu=cpu,trim=True)
        checkpoint = "w2-5"
        log.info("JOB_CHECKPOINT::%s"%checkpoint)

    treedir = os.path.realpath(os.path.join(resultdir,"trees"))
    if not os.path.exists(treedir):
        os.makedirs(treedir)

    #Add to prebuilt trees
    if checkpoint == "w2-5":
        log.info("JOB_PROGRESS::55/100")
        log.info("JOB_STATUS:: Adding to reference gene trees...")
        inlist = [os.path.join(trimdir,x) for x in os.listdir(trimdir)]
        addalltrees(inlist,os.path.join(refdir,reference),treedir,cpu=cpu)
        checkpoint = "w2-6"
        log.info("JOB_CHECKPOINT::%s" % checkpoint)

    if checkpoint == "w2-6":
        log.info("JOB_PROGRESS::85/100")
        log.info("JOB_STATUS:: Running coalescent tree phylogeny")
        #Combine all trees using ASTRAL
        flist = glob.glob(os.path.join(treedir,"RAxML_labelledTree.*.tree"))
        #reformat newick trees for astral: strip "[I<n>]" labels, quotes and the QUERY___ prefix
        #Raw string keeps the regex escapes intact; compiled once for all files
        cleanup = re.compile(r"\[I\d+?\]|\"|'|QUERY___")
        for pbt in flist:
            with open(pbt,"r") as ifil, open(pbt+".newick","w") as ofil:
                #Only the first line of each tree file is used.
                #Built-in next() works on Python 2.6+ and 3; file.next() is Python 2 only
                firstline = next(ifil)
                ofil.write(cleanup.sub("",firstline))
        flist = glob.glob(os.path.join(treedir,"*.newick"))

        alltrees = catTrees(flist,os.path.join(treedir,"alltrees.tree"))
        coltree = os.path.join(treedir,"summaryTree.tree")
        runAstral(resultdir,alltrees,coltree)

        checkpoint = "w2-F"
        log.info("JOB_CHECKPOINT::%s" % checkpoint)

    finishedtree=""
    if checkpoint == "w2-F":
        finaltree = os.path.join(resultdir,"final.tree")
        if os.path.exists(os.path.join(treedir,"summaryTree.tree")):
            finishedtree = os.path.join(treedir,"summaryTree.tree")
        #Copy final tree to root dir
        if finishedtree:
            fmat = 2
            log.debug("Saving final tree... %s"%fmat)
            #Fall back to a plain copy when rerooting reports failure
            if not ete3helper.rerootTree(finishedtree,finaltree,fmat=fmat):
                shutil.copy(finishedtree,finaltree)
        else:
            log.error("Could not find final tree. Tree building failed")

Example #3

Show file

File: uplb_workflow.py Project: emzodls/automlst_mod

def startUPLBwf(indir,
                resultdir,
                checkpoint=False,
                concat=False,
                mashmxdist=0.5,
                cpu=1,
                skip="",
                refdb="",
                hmmdb="",
                rnadb="",
                maxmlst=100,
                model="GTR",
                bs=0,
                kf=False,
                maxorg=50,
                filtMLST=True,
                fast=False,
                minmlst=10):
    """WORKFLOW 1: Build phylogeny from scratch

    Checkpoint-driven variant of workflow 1 that calls MASH once per query
    item instead of once for the whole query directory. Only the stages
    "w1-0" (parse genomes), "w1-1" (MASH distances) and "w1-2" (organism
    selection) appear in the visible code.

    Of the parameters, only indir, resultdir, checkpoint, mashmxdist, cpu,
    skip and maxorg are read in the visible code.

    Returns False when genome parsing or a MASH call fails.

    NOTE(review): the visible body looks unfinished; see the inline notes
    before relying on it.
    """
    if not checkpoint:
        checkpoint = "w1-0"
    queryseqs = os.path.join(resultdir, "queryseqs")

    #Parse all inputs
    if checkpoint == "w1-0":
        log.info("JOB_STATUS::Parsing all genomes...")
        if parsegenomes.parseall(indir, queryseqs):
            checkpoint = "w1-1"
            log.info("JOB_CHECKPOINT::%s" % checkpoint)
            log.info("JOB_PROGRESS::5/100")
        else:
            log.error("Problem parsing input genomes")
            return False

    #Run MASH distances
    # NOTE(review): .append() is called on this value below, which a bool does
    # not support -- presumably this should start as an empty list; confirm.
    mashresults = False
    if checkpoint == "w1-1":
        # NOTE(review): queryseqs is the path string built above, so this loop
        # yields single characters and len(queryseqs) is the path length --
        # presumably a list of query files was intended; confirm.
        for idx, queryseq in enumerate(queryseqs):
            log.info(
                "JOB_STATUS::Running MASH ANI estimation against reference sequences...{}"
                .format(len(queryseqs)))
            mashresult = mash.getdistances([queryseq],
                                           resultdir,
                                           cpu=cpu,
                                           maxdist=mashmxdist)
            if mashresult:
                # The advance to "w1-2" is disabled, so checkpoint stays "w1-1"
                # and the "w1-2" stage below is not entered in the same call.
                #        checkpoint = "w1-2"
                log.info("JOB_CHECKPOINT::%s" % checkpoint)
                log.info("JOB_PROGRESS::Seq {} of {}".format(
                    idx + 1, len(queryseqs)))
                mashresults.append(mashresult)
            else:
                log.error("MASH distance failed")
                return False

    #Get set of organisms to build seq DB
    selorgDict = False
    # Reached only when the caller passes checkpoint="w1-2"; in that case
    # mashresults is still the initial False from above.
    if checkpoint == "w1-2":
        log.info("JOB_STATUS::Loading mash results...")
        # selorgs is overwritten on every iteration; only the last value survives
        for mashresult in mashresults:
            selorgs = getorgs(resultdir,
                              mashresult,
                              skip=skip,
                              IGlimit=maxorg,
                              minorgs=25)
        # NOTE(review): zip() gets a single argument here -- looks incomplete.
        selorgDict = zip(queryseqs, )
    # NOTE(review): autosel and selection are not defined anywhere in the
    # visible function; presumably module-level names or leftover code -- verify.
    # This block runs on every call, not only in stage "w1-2".
    with open(autosel, "w") as fil:
        json.dump(selection, fil)

    # selorgs is only bound inside the "w1-2" loop above
    print(selorgs)