Esempio n. 1
0
def bppSite(bppFile, bppMixed, alnFile, alnFormat, treeFile, lModels, outDir,
            baseName, logger):
    """
    Run the Bio++ site analysis: optimize every requested model with bppml,
    compare nested models with likelihood ratio tests, then run
    bppmixedlikelihoods on the optimized models.

    All output files are written in <outDir>bpp_site/ and are named after
    baseName and the model.

    @param bppFile: bppml parameter file (given to bppml as "param")
    @param bppMixed: bppmixedlikelihoods parameter file (given as "param")
    @param alnFile: alignment file (INPUTFILE)
    @param alnFormat: format of the alignment file (FORMAT)
    @param treeFile: tree file (TREEFILE); once M0 is part of lModels, the
        tree written by the M0 optimization is used for the other models
    @param lModels: list of model names; names starting with "M" are run as
        YNGP_* models, names starting with "DFP07" as DFP07 models
    @param outDir: output directory, concatenated as is (must end with "/")
    @param baseName: prefix of every output file name
    @param logger: logger used for progress and debug messages
    @return: None
    """
    logger.info(os.getcwd())
    ### SITE ANALYSIS: BIO++
    logger.info("Bio++ Site Analysis")
    logger.info("Models to be run: {:s}".format(", ".join(lModels)))
    logger.info("Bppml parameter file: {:s}".format(bppFile))

    nodes = PSPFunc.nbNode(treeFile, logger)

    ## Bppml
    # Optimize tree and model using bppml. Variables given on the command line:
    #   INPUTFILE - alignment file
    #   FORMAT - format of the alignment file
    #   TREEFILE - tree file for the analyzed alignment
    #   MODEL - model description (YNGP_M0 ... M8, or DFP07)
    #   NODES - value returned by PSPFunc.nbNode for the tree file
    #   IGNORE - parameters left out of the optimization (ex: omegas in M8a)
    #   OUTTREE - name of the optimized output tree
    #   OUTPARAMS - name of the output file summarizing parameters
    #   BACKUP - name of the optimization backup/log file

    # os.makedirs also creates missing parents, like bppBranch does.
    outSite = outDir + "bpp_site/"
    if not os.path.exists(outSite):
        os.makedirs(outSite)

    # Dictionaries that associate each model with its output file names.
    outFileName = outSite + baseName
    dModelTrees = {
        model: outFileName + "_" + model + ".dnd"
        for model in lModels
    }
    dModelParams = {
        model: outFileName + "_" + model + ".params"
        for model in lModels
    }
    dModelLog = {
        model: outFileName + "_optimization_" + model
        for model in lModels
    }
    # model -> [MODEL name, MODEL arguments for bppml]
    dModelSyntax = {
        model: ["YNGP_" + model, "frequencies=F3X4(initFreqs=observed)"]
        for model in lModels if model[0] == "M"
    }
    dModelSyntax.update({
        model: [
            model[:5], "protmodel=JTT92",
            "frequencies=F3X4(initFreqs=observed)", "p0=1"
        ]
        for model in lModels if model[:5] == "DFP07"
    })
    dLogLlh = {}  # dictionary(model:logllh)

    # Equivalences of specific parameter names between models, used to seed a
    # model with the values optimized for a previous one
    # (omega from M0->M1->M2->M7 & M0->DFP07). Each value is an expression in
    # which the key ("omega") is replaced by the previous value.
    # The conditional form avoids dividing by zero when omega is exactly 1.
    dequiv = {
        "omega": {
            "M1": {
                "YNGP_M1.omega": "omega"
            },
            "M2": {
                "YNGP_M2.omega0": "omega"
            },
            "M0": {
                "YN98.omega": "omega"
            },
            "M7": {
                "YNGP_M7.p": "(1 if omega==1 else omega/(1-omega))",
                "YNGP_M7.q": "1"
            },
            "M8": {
                "YNGP_M8.p": "(1 if omega==1 else omega/(1-omega))",
                "YNGP_M8.q": "1"
            },
            "DFP07_0": {
                "DFP07.omega": "omega"
            },
            "DFP07": {
                "DFP07.omega": "omega",
                "DFP07.p0": "0.1"  # 0.1 to avoid optim stuck at p0=1
            }
        }
    }

    for model in lModels:
        # take into account the specificities of each model (number of classes n for example)
        if model == "M7" or model == "M8":
            dModelSyntax[model].append("n=4")
        if model[0] == "M" and len(model) > 2:
            # ex: M8a is run as YNGP_M8 with omegas fixed to 1
            dModelSyntax[model][0] = dModelSyntax[model][0][:-1]
            dModelSyntax[model].append("omegas=1")

        # Use previous backup file (in order M0->M1->M2->M7->M8) to accelerate optimization.
        # Both names are reset for every model so that nothing leaks from the
        # previous iteration when a backup file already exists.
        prevmodel = ""
        dnewpar = {}

        if not os.path.exists(dModelLog[model]):
            if model[0] == "M":
                lcandidates = ["M7", "M2", "M1", "M0"]
            elif model[:5] == "DFP07":
                lcandidates = ["DFP07_0", "M0"]
            else:
                lcandidates = []
            for candidate in lcandidates:
                if candidate in lModels and os.path.exists(
                        dModelLog[candidate] + ".def"):
                    prevmodel = candidate
                    break

            if prevmodel != "":
                logger.info("Optimization for model " + model +
                            " uses optimized parameters from model " +
                            prevmodel)
                # Lines of the backup are "name=value"; anything else is skipped.
                dprevpar = {}
                with open(dModelLog[prevmodel] + ".def", "r") as fprev:
                    for line in fprev:
                        name, sep, val = line.partition("=")
                        if sep:
                            dprevpar[name] = val

                # first copy all parameters
                for st, val in dprevpar.items():
                    if prevmodel == "M0":
                        if model[0] == "M":
                            nst = st.replace("YN98", "YNGP_" + model)
                        else:
                            nst = st.replace("YN98", "DFP07")
                    else:
                        nst = st.replace(prevmodel, model)
                    dnewpar.setdefault(nst, val)

                # And then for specific parameters
                for key, par in dequiv.items():
                    if model not in par or prevmodel not in par:
                        continue
                    parav = par[prevmodel]
                    parap = par[model]
                    for oname, oval in dprevpar.items():
                        ## look which oname is in equivalence list
                        for kparav in parav:
                            if not oname.startswith(kparav + "_"):
                                continue
                            for npar, nexp in parap.items():
                                nname = oname.replace(kparav, npar)
                                # NOTE(security): eval runs text read from the
                                # backup file; only safe while that file is
                                # produced by bppml itself.
                                nval = str(
                                    eval(nexp.replace(key, oval).strip()))
                                dnewpar.setdefault(nname, nval)

                # write in backup file
                if dnewpar:
                    with open(dModelLog[model], "w") as fnew:
                        for k, v in dnewpar.items():
                            fnew.write(k + "=" + v.strip() + "\n")

        # if M0 optimization in models, use tree optimized in M0 for subsequent model optimizations
        lignore = []
        if model != "M0" and "M0" in lModels:
            treeFile = dModelTrees["M0"] + "_1"
            lignore.append("BrLen")

        if model == "M8a":
            lignore.append("YNGP_M8.omegas*")

        if model == "DFP07_0":
            lignore.append("DFP07.p0_1")

        # do not re-optimize root & equilibrium if done before
        if prevmodel != "":
            logger.info("Optimization for model " + model +
                        " does not re-optimize root frequencies")
            lignore.append("Ancient")

            logger.info("Optimization for model " + model +
                        " does not re-optimize equilibrium frequencies")
            lignore.append("*_Full.theta*")
        ignore = ",".join(lignore)

        # create dictionary with all elements of the two argument lists to build commands
        modelDesc = dModelSyntax[model][0] + "(" + ",".join(
            dModelSyntax[model][1:]) + ")"
        dBppCmd = {
            "INPUTFILE": alnFile,
            "FORMAT": alnFormat,
            "TREEFILE": treeFile,
            "MODEL": modelDesc,
            "NODES": nodes,
            "IGNORE": ignore,
            "OUTTREE": dModelTrees[model],
            "OUTPARAMS": dModelParams[model],
            "BACKUP": dModelLog[model],
            "param": bppFile
        }

        # running bppml
        logger.info("Running {:s} optimization".format(model))
        _runBppProgram("bppml", dBppCmd, logger)

        # fill dictionary with loglikelihoods of each model
        # (read from the end of the first line of the params file)
        if os.path.exists(dModelParams[model]):
            with open(dModelParams[model], "r") as params:
                dLogLlh[model] = float(
                    params.readline().strip().split("= ")[-1])
            logger.info("Log Likelihood = {}".format(dLogLlh[model]))
        else:
            logger.info(
                "Possible failed optimization, likelihood has not been calculated."
            )

    # perform LRT
    # Each test requires BOTH models to have been requested, and BOTH
    # likelihoods to have been computed.
    # M1 vs M2
    if "M1" in lModels and "M2" in lModels:
        if "M1" in dLogLlh and "M2" in dLogLlh:
            LR12, p12 = PSPFunc.LRT(dLogLlh["M1"], dLogLlh["M2"], 2)
            logger.info("LRT of M1 vs M2: {}".format(p12))
        else:
            logger.info(
                "Possible failed optimization, likelihoods of M1 and M2 have not been computed."
            )
    # M7 vs M8
    if "M7" in lModels and "M8" in lModels:
        if "M7" in dLogLlh and "M8" in dLogLlh:
            LR78, p78 = PSPFunc.LRT(dLogLlh["M7"], dLogLlh["M8"], 2)
            logger.info("LRT of M7 vs M8: {}".format(p78))
        else:
            logger.info(
                "Possible failed optimization, likelihoods of M7 and M8 have not been computed."
            )
    # M8a vs M8
    if "M8" in lModels and "M8a" in lModels:
        if "M8" in dLogLlh and "M8a" in dLogLlh:
            LR88a, p88a = PSPFunc.LRT(dLogLlh["M8a"], dLogLlh["M8"], 1)
            ts88a = 0.5 * p88a + 0.5
            logger.info("LRT of M8 vs M8a: {} (Treshold: {})".format(
                p88a, ts88a))
        else:
            logger.info(
                "Possible failed optimization, likelihoods have not been computed."
            )
    # DFP07_0 vs DFP07
    if "DFP07" in lModels and "DFP07_0" in lModels:
        if "DFP07" in dLogLlh and "DFP07_0" in dLogLlh:
            LRDFP, pDFP = PSPFunc.LRT(dLogLlh["DFP07_0"], dLogLlh["DFP07"], 1)
            tsDFP = 0.5 * pDFP + 0.5
            logger.info("LRT of DFP07 vs DFP07_07: {} (Treshold: {})".format(
                pDFP, tsDFP))
        else:
            logger.info(
                "Possible failed optimization, likelihoods have not been computed."
            )

    # Bppmixedlikelihoods
    # Variables given on the command line:
    #   INPUTFILE - alignment file
    #   FORMAT - format of the alignment file
    #   TREEFILE - tree file for the analyzed alignment
    #   params - .params file from model optimization (bppml)
    #   OUTINFO - name of the results file (info about sites etc.)

    # dictionary(model:results file name)
    dModelResults = {
        model: outSite + baseName + "_results_" + model + ".log"
        for model in lModels
    }

    for model in lModels:
        # M0 and DFP07_0 are not run through bppmixedlikelihoods
        if model in ["M0", "DFP07_0"]:
            continue

        # use tree optimized in M0 for each model
        if "M0" in lModels:
            treeFile = dModelTrees["M0"] + "_1"
        else:
            treeFile = dModelTrees[model] + "_1"

        dMixCmd = {
            "INPUTFILE": alnFile,
            "FORMAT": alnFormat,
            "TREEFILE": treeFile,
            "params": dModelParams[model],
            "OUTINFO": dModelResults[model],
            "param": bppMixed
        }

        logger.info("Running mixed likelihoods with model {:s}".format(model))
        _runBppProgram("bppmixedlikelihoods", dMixCmd, logger)


def _runBppProgram(program, dArgs, logger):
    """
    Run a Bio++ program with the given "key=value" arguments and return its exit status.

    Standard output is drained with communicate() (a bare wait() on a piped
    process can block forever once the pipe buffer is full) and is logged at
    debug level.
    """
    # join each couple of the cmd dictionary so that it reads "k1=v1" "k2=v2" etc...
    argsMx = " ".join("\"{}={}\"".format(k, v) for k, v in dArgs.items())
    logger.debug(program + " " + argsMx)
    proc = subprocess.Popen(program + " " + argsMx,
                            shell=True,
                            stdout=subprocess.PIPE)
    out, _ = proc.communicate()
    if out:
        logger.debug(out.decode(errors="replace"))
    return proc.returncode
Esempio n. 2
0
def pspAnalysis(data, parms, aln, tree):
    """
    Run the positive selection analyses selected in the parameters.

    A time-stamped directory "positive_selection_results_<timestamp>/" is
    created under data.o and handed to every analysis.

    @param data: basicData object; data.o, data.baseName and data.alnFormat are read
    @param parms: dictionary of parameters; the keys "models", "paml", "bppml",
        "mixedlikelihood", "busted", "meme", "opb" and "gnh" are read
    @param aln: alignment file given to each analysis
    @param tree: tree file given to each analysis
    @return outDir: path of the directory created for the results
    """
    logger = logging.getLogger("main.positiveSelection")
    dCtrls, lModels = PSPFunc.getParams(parms["models"], parms["paml"],
                                        parms["bppml"],
                                        parms["mixedlikelihood"],
                                        parms["busted"], parms["meme"],
                                        parms["opb"], parms["gnh"])
    timeStamp = strftime("%Y%m%d%H%M", localtime())

    outDir = data.o + "positive_selection_results_" + timeStamp + "/"
    if not os.path.exists(outDir):
        os.makedirs(outDir)

    logger.info(":" + tree + ":")

    cladoFile = PSPFunc.supBoot(outDir, data.baseName, tree, logger)

    ### Terminal output for user
    logger.info("Output directory: {:s}".format(outDir))
    logger.info("Alignement: {:s}".format(aln))
    logger.info("Alignement is in {:s} format.".format(data.alnFormat))
    logger.info("Tree: {:s}".format(tree))

    ### Run the different analysis as determined by control file
    logger.info("Starting positive selection analyses.")
    logger.info("POSITIVE SELECTION ANALYSIS: ")
    logger.info("Analysis to be run:")

    dAnalysis = {
        "paml": "Site (codeml)",
        "BUSTED": "Whole-Gene",
        "bppml": "Site (Bio++ - Optimization)",
        "bppmixedlikelihood": "Site (Bio++ - Results)",
        "OPB": "Branch",
        "GNH": "Branch-site on positively selected branches",
        "MEME": "Branch-site"
    }
    for key in dCtrls:
        # fall back on the raw key rather than failing on an unlisted analysis
        logger.info(dAnalysis.get(key, key))

    if "BUSTED" in dCtrls:
        GeneAnalysis.hyphyBusted(aln, cladoFile, outDir, data.baseName, logger)

    # MEME is the only analysis whose failure is caught and skipped.
    if "MEME" in dCtrls:
        try:
            BranchAnalysis.memeBranchSite(aln, cladoFile, outDir,
                                          data.baseName, logger)
        except Exception:
            logger.error("MEME encountered an unexpected error, skipping.")

    if "bppml" in dCtrls:
        # without a dedicated bppmixedlikelihoods file, reuse the bppml one
        if not dCtrls.get("bppmixedlikelihood"):
            dCtrls["bppmixedlikelihood"] = dCtrls["bppml"]
        SiteAnalysis.bppSite(dCtrls["bppml"], dCtrls["bppmixedlikelihood"],
                             aln, data.alnFormat, tree, lModels, outDir,
                             data.baseName, logger)

    lPSNodes = []
    if "OPB" in dCtrls:
        BranchAnalysis.bppBranch(dCtrls["OPB"], outDir, data.baseName, aln,
                                 data.alnFormat, tree, logger)

    # NOTE(review): lPSNodes is never filled in this function, so the
    # branch-site step below cannot run as written; confirm where the
    # positively selected nodes are supposed to come from.
    if "OPB" in dCtrls and "GNH" in dCtrls and len(lPSNodes) > 1:
        BranchAnalysis.bppBranchSite(dCtrls["GNH"], lPSNodes, outDir,
                                     data.baseName, aln, data.alnFormat, tree,
                                     logger)

    if "paml" in dCtrls:
        SiteAnalysis.pamlSite(aln, tree, lModels, dCtrls["paml"], outDir,
                              data.baseName, logger)

    logger.info("Finished positive selection analyses.")
    return (outDir)
Esempio n. 3
0
def paramDef(params, inf, queryName, outdir):
    """
    Read the parameter file, check its content and return the complete
    dictionary of parameters.

    Each line of the file reads "name: value"; lines starting with "#" and
    lines without a name are ignored, unknown names are reported and skipped.
    Values given on the command line (inf, queryName, outdir) take priority
    over the ones of the file when they are not empty.
    The program exits (sys.exit) when the file does not exist or when the
    parameters required by the chosen step are missing.

    @param params: path of the parameter file
    @param inf: comma-separated input file(s) from the command line, or ""
    @param queryName: query name from the command line, or ""
    @param outdir: output directory from the command line, or ""
    @return defaultParam: dictionary of all parameters, defaults included
    """

    params = params.strip()
    if not os.path.exists(params):
        print("The provided parameter file does not exist, try again.")
        sys.exit()

    #Parsing
    lParams = [
        "infile", "queryName", "queryFile", "blastdb", "outdir", "logfile",
        "evalue", "mincov", "percID", "maxLen", "step", "remote", "entryQuery",
        "sptree", "APIKey", "phymlOpt", "recombination", "duplication",
        "LBopt", "nbspecies", "positiveSelection", "basename", "hyphySeuil",
        "busted", "meme", "models", "paml", "bppml", "mixedlikelihood", "opb",
        "gnh"
    ]

    dParams = {}
    with open(params, mode="r", encoding="utf-8") as content:
        for line in content:
            if line.startswith("#"):
                continue
            # Split on the first ":" only, so that values containing ":" are
            # kept whole; a name without ":" gets an empty value.
            name, _, value = line.partition(":")
            name = name.strip()
            if name == "":
                continue
            if name not in lParams:
                print(name + " is not a valid parameter.\n")
            else:
                dParams[name] = value.strip()

    #If infile(s) given through command line, takes priority
    # (a missing "infile" entry is handled like an empty one)
    infiles = inf if inf != "" else dParams.get("infile", "")
    dParams["infile"] = [entry.strip() for entry in infiles.split(",")]

    #Idem queryName
    if queryName != "":
        dParams["queryName"] = queryName

    #Idem outdir
    if outdir != "":
        dParams["outdir"] = outdir

    #If list of file given, split and check what each file is
    if len(dParams["infile"]) > 1:
        for entryfile in dParams["infile"]:
            if FormatFunc.isCCDSFasta(entryfile):
                dParams["queryFile"] = os.path.abspath(entryfile)
            if FormatFunc.isAln(entryfile):
                dParams["alnfile"] = os.path.abspath(entryfile)
            if FormatFunc.isTree(entryfile):
                dParams["treefile"] = os.path.abspath(entryfile)
    else:
        dParams["queryFile"] = os.path.abspath(dParams["infile"][0])

    if dParams.get("queryFile", "") != "":
        dParams["infile"] = dParams["queryFile"]
    elif dParams.get("alnfile", "") != "":
        dParams["infile"] = dParams["alnfile"]

    # Turn yes/no answers into booleans
    answers = ["Y", "YES", "T", "TRUE"]
    negAnswers = ["N", "NO", "F", "FALSE"]

    for param in list(dParams):
        value = dParams[param]
        if not isinstance(value, str):
            continue
        if value.upper() in answers:
            dParams[param] = True
        elif value.upper() in negAnswers:
            dParams[param] = False

    #Check if parameters are correct
    lSteps = [
        "blast", "accessions", "fasta", "orf", "alignment", "tree",
        "duplication", "recombination", "positiveSelection", ""
    ]

    if "step" not in dParams or dParams["step"] not in lSteps:
        print("Step \"{}\" not available, set to blast by default.".format(
            dParams.get("step", "")))
        dParams["step"] = "blast"
    if dParams["step"] == "":
        dParams["step"] = "blast"

    if "remote" not in dParams or dParams["remote"] == "":
        print("Remote option needs to be a boolean, set to True by default.")
        dParams["remote"] = True

    if "positiveSelection" not in dParams:
        print(
            "Positive selection analyses will not be executed, set to False by default."
        )
        dParams["positiveSelection"] = False

    elif dParams["positiveSelection"]:
        if dParams["step"] == "positiveSelection":
            if "treefile" not in dParams or dParams["treefile"] == "":
                print(
                    "The pipeline requires a phylogenetic tree. Please provide one."
                )
                sys.exit()
            elif "alnfile" not in dParams or dParams["alnfile"] == "":
                print(
                    "The pipeline requires a codon alignment. Please provide one."
                )
                sys.exit()
        for opt in [
                "meme", "busted", "models", "paml", "bppml", "mixedlikelihood",
                "opb", "gnh"
        ]:
            if opt not in dParams:
                dParams[opt] = ""

            elif opt not in ["meme", "busted", "models", "paml"]:
                # Bio++ options: either the path of an existing parameter file,
                # or a truthy answer asking for a default file to be created
                # next to the input file.
                if not isinstance(dParams[opt], bool) and os.path.exists(
                        dParams[opt].strip("\n")):
                    dParams[opt] = dParams[opt].strip("\n")
                elif dParams[opt]:
                    path = "/".join(dParams["infile"].split("/")
                                    [:-1]) + "/" + opt + "_params.bpp"
                    PSPFunc.pspFileCreation(path, opt)
                    dParams[opt] = path

            elif opt == "models":
                # keep only the known models; a yes/no answer names no model
                models = dParams[opt] if isinstance(dParams[opt], str) else ""
                ltemp = []
                for M in map(str.strip, models.split(",")):
                    if M == "":
                        continue
                    if M not in [
                            "M0", "M1", "M2", "M7", "M8", "M8a", "DFP07_0",
                            "DFP07"
                    ]:
                        print(M + " isn't a valid model.")
                    else:
                        ltemp.append(M)
                dParams[opt] = ",".join(ltemp)

    elif dParams["step"] == "positiveSelection":
        print(
            "Error: positiveSelection option set to false and step set to positiveSelection."
        )
        sys.exit()

    if dParams["step"] in ["blast", "accessions", "fasta"]:
        if dParams["infile"] == "" or dParams.get("blastdb", "") == "":
            print("Infile and Blastdb are necessary.")
            sys.exit()

    #Creation of a dictionary with all the parameters
    defaultParam = {
        "infile": "",
        "queryName": "",
        "queryFile": "",
        "blastdb": "",
        "outdir": "",
        "logfile": "",
        "evalue": 1e-3,
        "mincov": 50,
        "percID": 70,
        "maxLen": "cutoff",
        "entryQuery": "",
        "APIKey": "",
        "phymlOpt": "",
        "sptree": "",
        "duplication": False,
        "LBopt": "cutoff",
        "nbspecies": 8,
        "recombination": False,
        "remote": False,
        "step": "blast",
        "positiveSelection": False,
        "alnfile": "",
        "treefile": "",
        "alnformat": "Fasta",
        "basename": "",
        "hyphySeuil": 0.05,
        "busted": False,
        "meme": False,
        "models": "",
        "paml": "",
        "bppml": "",
        "mixedlikelihood": "",
        "opb": False,
        "gnh": False
    }

    # every non-empty value read above replaces the default one
    for i in defaultParam:
        if i in dParams and dParams[i] != "":
            defaultParam[i] = dParams[i]

    return defaultParam
Esempio n. 4
0
def bppBranch(OPBFile, outDir, baseName, alnFile, alnFormat, treeFile, logger):
    """
    Run the Bio++ "One Per Branch" analysis and test each branch.

    bppml is first run with a YNGP_M2 model in a OnePerBranch process. Then,
    for each branch model read from the output parameter file, bppml is run
    again with theta2 and omega2 of that branch set to 1, and both
    likelihoods are compared with PSPFunc.LRT. One line per tested branch is
    written in <outDir>bpp_branch/<baseName>_branch.txt.

    @param OPBFile: bppml parameter file (given to bppml as "param")
    @param outDir: output directory, concatenated as is (must end with "/")
    @param baseName: prefix of every output file name
    @param alnFile: alignment file (INPUTFILE)
    @param alnFormat: format of the alignment file (FORMAT)
    @param treeFile: tree file (TREEFILE)
    @param logger: logger used for progress and debug messages
    @return outParams: path of the parameter file written by the first bppml run
    """

    def _readBackup(path):
        # "name=value" lines of a bppml backup file, values read as floats
        with open(path, "r") as fback:
            return {
                param.split("=")[0]: float(param.split("=")[1])
                for param in fback
            }

    def _bppmlCommand(dArgs):
        # join each couple of the dictionary so that it reads "k1=v1" "k2=v2" etc...
        return "bppml \"" + "\" \"".join(
            [k + "=" + v for k, v in dArgs.items()]) + "\""

    ### BRANCH ANALYSIS: BIO++ ONE PER BRANCH

    logger.info("One Per Branch (BIO++)")
    logger.info("OPB parameter file: {:s}".format(OPBFile))

    outOPB = outDir + "bpp_branch/"
    if not os.path.exists(outOPB):
        os.makedirs(outOPB)

    model = "M2"

    outFileName = outOPB + baseName
    outTree = outFileName + "_" + model + ".dnd"
    outParams = outFileName + "_" + model + ".params"
    outBackup = outFileName + "_optimization_" + model

    # create dictionary with all elements of the two argument lists to build commands
    dBppCmd = {
        "INPUTFILE": alnFile,
        "FORMAT": alnFormat,
        "TREEFILE": treeFile,
        "OUTTREE": outTree,
        "OUTPARAMS": outParams,
        "BACKUP": outBackup,
        "model1": "YNGP_" + model + "(frequencies=F3X4(initFreqs=observed))",
        "param": OPBFile,
        "process1":
        "OnePerBranch(model=1, tree=1, rate=1, root_freq=1, shared_parameters=(*kappa, *Full.theta*))"
    }
    # running bppml
    logger.info("Running Branch optimization")

    ### look for previous M0 optim (backup files of the site analysis)
    outSite = outDir + "bpp_site/"
    outSiteFileName = outSite + baseName

    lModels = ["M0", "M2"]
    dPrevModelLog = {
        prev: outSiteFileName + "_optimization_" + prev
        for prev in lModels
    }
    prevmodel, dnewpar = getNewParfromOptim(model, lModels, dPrevModelLog,
                                            logger)
    if prevmodel != "":
        # seed the optimization with the parameters of the previous model
        with open(outBackup, "w") as fnew:
            for k, v in dnewpar.items():
                fnew.write(k + "=" + v.strip() + "\n")
    lignore = setIgnoreParams(model, prevmodel, lModels, logger)

    dBppCmd["IGNORE"] = ",".join(lignore)

    argsOPB = _bppmlCommand(dBppCmd)
    logger.debug(argsOPB)
    cmd(argsOPB, False)

    # test each branch
    # Scan all parameter names
    dparam = _readBackup(dBppCmd["BACKUP"] + ".def")

    valM2 = float(dparam["f(x)"])

    with open(outParams, "r") as fparam:
        lcmd = fparam.readlines()

    ## Look for correspondence  model_nb <-> node_id
    lprocess = [l for l in lcmd if l[:7] == "process"][0]
    lid = lprocess.split(".nodes_id=(")
    cormodid = {}
    # After the split, piece i ends with the model name ("...model<nb>") and
    # piece i+1 starts with the node id ("<id>)..."), so every consecutive
    # pair of pieces gives one correspondence.
    for i in range(len(lid) - 1):
        mod = int(lid[i][lid[i].rfind("l") + 1:])
        idi = int(lid[i + 1][:lid[i + 1].find(")")])
        cormodid[idi] = mod

    ## Compute lk for each node with theta1_mod * theta2_mod < 0.999:
    # each retained branch is re-optimized with theta2 and omega2 set to 1
    del dBppCmd["process1"]
    del dBppCmd["model1"]
    with open(outFileName + "_branch.txt", "w") as fresbranch:
        fresbranch.write("Id\tomega2\tprop\tM2\tM1\tLR\tp\n")
        for idi, mod in cormodid.items():
            theta1Name = "YNGP_M2.theta1_%d" % mod
            theta2Name = "YNGP_M2.theta2_%d" % mod
            omega0Name = "YNGP_M2.omega0_%d" % mod
            omega2Name = "YNGP_M2.omega2_%d" % mod

            if dparam[theta2Name] * dparam[theta1Name] >= 0.999:
                continue

            # backup of the branch: same values, with theta2 and omega2 set to 1
            with open(outBackup + "_%d" % mod, "w") as fback:
                for key, val in dparam.items():
                    if key != theta2Name:
                        fback.write(key + "=" + str(val) + "\n")
                fback.write("YNGP_M2.theta2_%d=1\n" % mod)
                fback.write("YNGP_M2.omega2_%d=1\n" % mod)

            # only theta1 and omega0 of the branch are left to optimize
            lignore2 = lignore[:] + [
                key for key in dparam if key not in [theta1Name, omega0Name]
            ]
            dBppCmd["IGNORE"] = ",".join(lignore2)
            dBppCmd["params"] = outParams
            dBppCmd["OUTPARAMS"] = outParams + "_%d" % idi
            dBppCmd["BACKUP"] = outBackup + "_%d" % mod

            cmd(_bppmlCommand(dBppCmd), False)

            dparam2 = _readBackup(dBppCmd["BACKUP"] + ".def")

            valM1 = float(dparam2["f(x)"])
            LR, p = PSPFunc.LRT(valM2, valM1, 2)
            fresbranch.write(
                "%d\t%f\t%f\t%f\t%f\t%f\t%f\n" %
                (idi, dparam[omega2Name],
                 (1 - dparam[theta2Name]) * (1 - dparam[theta1Name]), valM2,
                 valM1, LR, p))
            if p < 0.05:
                logger.info("Node {:d} is interesting (w = {:f})".format(
                    idi, dparam[omega2Name]))
    return (outParams)
Esempio n. 5
0
def _runBppTool(program, dCmd, logger):
    """Run a Bio++ command-line program with ``key=value`` arguments.

    Args:
        program: name of the executable to call (e.g. "bppml").
        dCmd: dictionary mapping argument names to string values; every
            pair is passed as one double-quoted "key=value" argument, in
            the iteration order of the dictionary.
        logger: logger receiving the command line and the captured output
            (both at debug level).

    Returns:
        The exit status of the program.
    """
    # join each couple of the cmd dictionary so that it reads "k1=v1" "k2=v2" etc...
    quotedArgs = "\"" + "\" \"".join([k + "=" + v
                                      for k, v in dCmd.items()]) + "\""
    cmdLine = program + " " + quotedArgs
    logger.debug(cmdLine)
    proc = subprocess.Popen(cmdLine, shell=True, stdout=subprocess.PIPE)
    # communicate() reads stdout while waiting for the child. A bare wait()
    # combined with stdout=PIPE can block forever once the child has filled
    # the OS pipe buffer, because nobody is reading the other end.
    output, _ = proc.communicate()
    if output:
        logger.debug(output.decode(errors="replace"))
    return proc.returncode


def _logSiteLRT(lModels, dLogLlh, logger):
    """Log the likelihood ratio tests between nested site models.

    A test is considered only when BOTH models of the pair were requested
    in lModels, and is computed only when BOTH log-likelihoods are present
    in dLogLlh; otherwise a "possible failed optimization" message is
    logged instead.

    Args:
        lModels: list of the model names that were requested.
        dLogLlh: dictionary model name -> log-likelihood (float).
        logger: logger receiving the results (info level).
    """
    noLlhMsg = "Possible failed optimization, likelihoods have not been computed."
    # (first LRT argument, second LRT argument, degrees of freedom,
    #  also log 0.5 * p + 0.5, result message, failure message)
    lTests = [
        ("M1", "M2", 2, False, "LRT of M1 vs M2: {}",
         "Possible failed optimization, likelihoods of M1 and M2 have not been computed."
         ),
        ("M7", "M8", 2, False, "LRT of M7 vs M8: {}",
         "Possible failed optimization, likelihoods of M7 and M8 have not been computed."
         ),
        ("M8a", "M8", 1, True, "LRT of M8 vs M8a: {} (Treshold: {})",
         noLlhMsg),
        ("DFP07_0", "DFP07", 1, True,
         "LRT of DFP07 vs DFP07_07: {} (Treshold: {})", noLlhMsg),
    ]

    for first, second, df, withThreshold, okMsg, failMsg in lTests:
        # NB: `"A" and "B" in x` only tests "B"; both memberships must be
        # checked explicitly, otherwise dLogLlh[first] may raise KeyError.
        if first not in lModels or second not in lModels:
            continue
        if first not in dLogLlh or second not in dLogLlh:
            logger.info(failMsg)
            continue

        _, pval = PSPFunc.LRT(dLogLlh[first], dLogLlh[second], df)
        if withThreshold:
            logger.info(okMsg.format(pval, 0.5 * pval + 0.5))
        else:
            logger.info(okMsg.format(pval))


def bppSite(bppFile, bppMixed, alnFile, alnFormat, treeFile, lModels, outDir,
            baseName, logger):
    """Run the Bio++ site analysis: bppml, LRTs, then bppmixedlikelihoods.

    For every model of lModels, bppml is run and the log-likelihood is read
    from the first line of the resulting .params file. Likelihood ratio
    tests are then logged for the pairs M1/M2, M7/M8, M8a/M8 and
    DFP07_0/DFP07, and bppmixedlikelihoods is finally run for every model
    except M0 and DFP07_0. All output files are written to
    outDir + "bpp_site/".

    Variables passed to bppml:
        INPUTFILE - alignment file
        FORMAT - format of the alignment file
        TREEFILE - tree file for the analyzed alignment
        MODEL - model to run on the data (YNGP_M0 through 8, DFP07 models)
        IGNORE - parameters to ignore for optimization
        OUTTREE - name of the optimized output tree
        OUTPARAMS - name of the output file summarizing parameters
        BACKUP - name of log file
        param - bppml parameter file (bppFile)

    Variables passed to bppmixedlikelihoods:
        INPUTFILE, FORMAT, TREEFILE - as above
        params - .params file produced by bppml for the model
        OUTINFO - name of the results file
        param - bppmixedlikelihoods parameter file (bppMixed)

    Args:
        bppFile: path of the bppml parameter file.
        bppMixed: path of the bppmixedlikelihoods parameter file.
        alnFile: path of the alignment file.
        alnFormat: format of the alignment file.
        treeFile: path of the tree file; replaced by the tree written for
            M0 (suffix "_1") for the other models when M0 is in lModels.
        lModels: list of model names ("M0", "M1", ..., "M8a", "DFP07",
            "DFP07_0").
        outDir: output directory, expected to end with a path separator
            since "bpp_site/" is appended by plain concatenation.
        baseName: prefix of every output file name.
        logger: logger receiving progress and results.
    """
    # outDir=os.getcwd()+"/"  # used to debug
    logger.info(os.getcwd())
    ### SITE ANALYSIS: BIO++
    logger.info("Bio++ Site Analysis")
    logger.info("Models to be run: {:s}".format(", ".join(lModels)))
    logger.info("Bppml parameter file: {:s}".format(bppFile))

    ## Bppml
    # Bppml output file names - dictionaries that associate model number with output file name for the model
    outSite = outDir + "bpp_site/"
    # no shell involved: safe with spaces in the path, no race with a
    # concurrent creation, and an error is raised if creation fails
    os.makedirs(outSite, exist_ok=True)

    outFileName = outSite + baseName
    dModelTrees = {m: outFileName + "_" + m + ".dnd" for m in lModels}
    dModelParams = {m: outFileName + "_" + m + ".params" for m in lModels}
    dModelLog = {m: outFileName + "_optimization_" + m for m in lModels}

    # dictionary model number - [MODEL name, MODEL arguments for bppml]
    dModelSyntax = {
        m: ["YNGP_" + m, "frequencies=F3X4(initFreqs=observed)"]
        for m in lModels if m[0] == "M"
    }
    dModelSyntax.update({
        m: [m[:5], "protmodel=JTT92", "frequencies=F3X4(initFreqs=observed)"]
        for m in lModels if m[:5] == "DFP07"
    })
    # take into account the specificities of each model (number of classes n for example)
    for model in lModels:
        if model in ["M7", "M8"]:
            dModelSyntax[model].append("n=4")
            dModelSyntax[model].append("q=1")
        if model[0] == "M" and len(model) > 2:
            # e.g. M8a: drop the trailing letter of the model name and
            # add omegas=1 to the model arguments
            dModelSyntax[model][0] = dModelSyntax[model][0][:-1]
            dModelSyntax[model].append("omegas=1")
        if model[:5] == "DFP07":
            dModelSyntax[model].append(
                "p0=0.1" if model == "DFP07" else "p0=1")
    dLogLlh = {}  # dictionary(model:logllh)

    for model in lModels:
        prevmodel, dnewpar = getNewParfromOptim(model, lModels, dModelLog,
                                                logger)
        if prevmodel != "":
            # write the parameters returned for this model into its BACKUP file
            with open(dModelLog[model], "w") as fnew:
                for k, v in dnewpar.items():
                    fnew.write(k + "=" + v.strip() + "\n")

            lignore = setIgnoreParams(model, prevmodel, lModels, logger)
            ignore = ",".join(lignore)
        else:
            ignore = ""

        # use tree optimized in M0 for the other models
        if model != "M0" and "M0" in lModels:
            treeFile = dModelTrees["M0"] + "_1"

        # create dictionary with all elements of the two argument lists to build commands
        modelDesc = dModelSyntax[model][0] + "(" + ",".join(
            dModelSyntax[model][1:]) + ")"
        dBppCmd = {
            "INPUTFILE": alnFile,
            "FORMAT": alnFormat,
            "TREEFILE": treeFile,
            "MODEL": modelDesc,
            "IGNORE": ignore,
            "OUTTREE": dModelTrees[model],
            "OUTPARAMS": dModelParams[model],
            "BACKUP": dModelLog[model],
            "param": bppFile
        }

        # running bppml
        logger.info("Running {:s} optimization".format(model))
        runMx = _runBppTool("bppml", dBppCmd, logger)

        # fill dictionary with loglikelihoods of each model
        if os.path.exists(dModelParams[model]):
            with open(dModelParams[model], "r") as params:
                # the value is taken after the last "= " of the first line
                dLogLlh[model] = float(
                    params.readline().strip().split("= ")[-1])
            logger.info("Log Likelihood = {}".format(dLogLlh[model]))
        else:
            logger.info(
                "Possible failed optimization, likelihood has not been calculated."
            )

    # perform LRT
    _logSiteLRT(lModels, dLogLlh, logger)

    # Bppmixedlikelihoods
    # dictionary(model:results file name)
    dModelResults = {
        m: outSite + baseName + "_results_" + m + ".log"
        for m in lModels
    }

    for model in lModels:
        # use tree optimized in M0 for each model
        if "M0" in lModels:
            treeFile = dModelTrees["M0"] + "_1"
        else:
            treeFile = dModelTrees[model] + "_1"

        if model in ["M0", "DFP07_0"]:
            continue

        dMixCmd = {
            "INPUTFILE": alnFile,
            "FORMAT": alnFormat,
            "TREEFILE": treeFile,
            "params": dModelParams[model],
            "OUTINFO": dModelResults[model],
            "param": bppMixed
        }

        logger.info("Running mixed likelihoods with model {:s}".format(model))
        runMx = _runBppTool("bppmixedlikelihoods", dMixCmd, logger)