def bppSite(bppFile, bppMixed, alnFile, alnFormat, treeFile, lModels, outDir,
            baseName, logger):
    """
    Bio++ site analysis: optimize each requested model with bppml, compare
    nested models with likelihood ratio tests, then run bppmixedlikelihoods.

    @param bppFile: bppml parameter file (passed as "param")
    @param bppMixed: bppmixedlikelihoods parameter file (passed as "param")
    @param alnFile: alignment file (INPUTFILE)
    @param alnFormat: format of the alignment file (FORMAT)
    @param treeFile: tree file (TREEFILE); replaced by the tree optimized
                     under M0 for the other models when "M0" is in lModels
    @param lModels: list of model names ("M0", "M1", "M2", "M7", "M8",
                    "M8a", "DFP07_0", "DFP07")
    @param outDir: output directory, concatenated as-is with "bpp_site/"
    @param baseName: prefix of every output file name
    @param logger: logging object
    """
    # outDir=os.getcwd()+"/"  # used to debug
    logger.info(os.getcwd())

    ### SITE ANALYSIS: BIO++
    logger.info("Bio++ Site Analysis")
    logger.info("Models to be run: {:s}".format(", ".join(lModels)))
    logger.info("Bppml parameter file: {:s}".format(bppFile))

    nodes = PSPFunc.nbNode(treeFile, logger)

    def runBpp(program, dCmd):
        """Run a Bio++ program with "key=value" arguments, log its output."""
        # join each couple of the cmd dictionary so that it reads
        # "k1=v1" "k2=v2" etc...
        argsMx = "\"" + "\" \"".join(
            ["{}={}".format(k, v) for k, v in dCmd.items()]) + "\""
        logger.debug(program + " " + argsMx)
        # The command line is kept as a shell string, as before.
        # communicate() drains stdout while waiting: wait() on a piped
        # process can block forever once the pipe buffer is full.
        proc = subprocess.Popen(program + " " + argsMx,
                                shell=True,
                                stdout=subprocess.PIPE)
        out, _ = proc.communicate()
        if out:
            logger.debug(out.decode(errors="replace"))

    ## Bppml
    # Optimize tree and model using bppml. Variables passed are:
    #   INPUTFILE - alignment file
    #   FORMAT - format of the aln file
    #   TREEFILE - tree file for the analyzed aln
    #   MODEL - model run on the data (YNGP_M0 through 8, same models as
    #           PAML, and DFP07 models)
    #   NODES - number of nodes in the tree file
    #   IGNORE - parameters to ignore for optimization, for example if one
    #            is fixed (ex: omegas in M8a)
    #   OUTTREE - name of the optimized output tree
    #   OUTPARAMS - name of the output file summarizing parameters
    #   BACKUP - name of log file

    # Bppml output file names - dictionaries that associate model number
    # with output file name for the model
    outSite = outDir + "bpp_site/"
    if not os.path.exists(outSite):
        os.makedirs(outSite)

    outFileName = outSite + baseName
    dModelTrees = {
        model: outFileName + "_" + model + ".dnd"
        for model in lModels
    }
    dModelParams = {
        model: outFileName + "_" + model + ".params"
        for model in lModels
    }
    dModelLog = {
        model: outFileName + "_optimization_" + model
        for model in lModels
    }

    # dictionary model number - [MODEL name, MODEL arguments for bppml]
    dModelSyntax = {
        model: ["YNGP_" + model, "frequencies=F3X4(initFreqs=observed)"]
        for model in lModels if model[0] == "M"
    }
    dModelSyntax.update({
        model: [
            model[:5], "protmodel=JTT92",
            "frequencies=F3X4(initFreqs=observed)", "p0=1"
        ]
        for model in lModels if model[:5] == "DFP07"
    })

    # Use previous backup file (in order M0->M1->M2->M7->M8) to accelerate
    # optimization: dictionary of equivalences of specific parameter names
    # between models (omega from M0->M1->M2->M7 & M0->DFP07).
    dequiv = {
        "omega": {
            "M1": {"YNGP_M1.omega": "omega"},
            "M2": {"YNGP_M2.omega0": "omega"},
            "M0": {"YN98.omega": "omega"},
            "M7": {
                "YNGP_M7.p": "[omega/(1-omega),1][omega==1]",
                "YNGP_M7.q": "1"
            },
            "M8": {
                "YNGP_M8.p": "[omega/(1-omega),1][omega==1]",
                "YNGP_M8.q": "1"
            },
            "DFP07_0": {"DFP07.omega": "omega"},
            # 0.1 to avoid optim stuck at p0=1
            "DFP07": {"DFP07.omega": "omega", "DFP07.p0": "0.1"},
        }
    }

    dLogLlh = {}  # dictionary(model:logllh)

    for model in lModels:
        # take into account the specificities of each model (number of
        # classes n for example)
        if model in ("M7", "M8"):
            dModelSyntax[model].append("n=4")
        if model[0] == "M" and len(model) > 2:
            # e.g. M8a is run as YNGP_M8 with omegas fixed to 1
            dModelSyntax[model][0] = dModelSyntax[model][0][:-1]
            dModelSyntax[model].append("omegas=1")

        # Reset for every model: when the backup file already exists no
        # previous model is looked up, and the value must not leak from the
        # preceding iteration (or be undefined on the first one).
        prevmodel = ""
        dnewpar = {}
        if not os.path.exists(dModelLog[model]):
            if model[0] == "M":
                lCandidates = ["M7", "M2", "M1", "M0"]
            elif model[:5] == "DFP07":
                lCandidates = ["DFP07_0", "M0"]
            else:
                lCandidates = []
            # first candidate that was requested and already optimized
            for candidate in lCandidates:
                if candidate in lModels and os.path.exists(
                        dModelLog[candidate] + ".def"):
                    prevmodel = candidate
                    break

            if prevmodel != "":
                logger.info("Optimization for model " + model +
                            " uses optimized parameters from model " +
                            prevmodel)
                with open(dModelLog[prevmodel] + ".def", "r") as fprev:
                    lprev = fprev.readlines()
                dprevpar = {
                    l[:l.find("=")]: l[l.find("=") + 1:]
                    for l in lprev
                }

                # first copy all parameters
                for st, val in dprevpar.items():
                    if prevmodel == "M0":
                        if model[0] == "M":
                            nst = st.replace("YN98", "YNGP_" + model)
                        else:
                            nst = st.replace("YN98", "DFP07")
                    else:
                        nst = st.replace(prevmodel, model)
                    dnewpar.setdefault(nst, val)

                # And then for specific parameters
                for key, par in dequiv.items():
                    if model not in par or prevmodel not in par:
                        continue
                    parav = par[prevmodel]
                    parap = par[model]
                    for oname, oval in dprevpar.items():
                        ## look which oname is in equivalence list
                        for kparav in parav:
                            if not oname.startswith(kparav + "_"):
                                continue
                            for npar, nexp in parap.items():
                                nname = oname.replace(kparav, npar)
                                # WARNING: eval() runs text read from the
                                # .def file of the previous optimization;
                                # only safe while that file is produced by
                                # bppml and not editable by untrusted users.
                                nval = str(
                                    eval(nexp.replace(key, oval).strip()))
                                dnewpar.setdefault(nname, nval)

            # write in backup file
            if len(dnewpar) != 0:
                with open(dModelLog[model], "w") as fnew:
                    for k, v in dnewpar.items():
                        fnew.write(k + "=" + v.strip() + "\n")

        # if M0 optimization in models, use tree optimized in M0 for
        # subsequent model optimizations
        lignore = []
        if model != "M0" and "M0" in lModels:
            treeFile = dModelTrees["M0"] + "_1"
            lignore.append("BrLen")

        if model == "M8a":
            lignore.append("YNGP_M8.omegas*")

        if model == "DFP07_0":
            lignore.append("DFP07.p0_1")

        # do not re-optimize root & equilibrium if done before
        if prevmodel != "":
            logger.info("Optimization for model " + model +
                        " does not re-optimize root frequencies")
            lignore.append("Ancient")
            logger.info("Optimization for model " + model +
                        " does not re-optimize equilibrium frequencies")
            lignore.append("*_Full.theta*")

        ignore = ",".join(lignore)

        # create dictionary with all elements of the two argument lists to
        # build commands
        modelDesc = dModelSyntax[model][0] + "(" + ",".join(
            dModelSyntax[model][1:]) + ")"
        dBppCmd = {
            "INPUTFILE": alnFile,
            "FORMAT": alnFormat,
            "TREEFILE": treeFile,
            "MODEL": modelDesc,
            "NODES": nodes,
            "IGNORE": ignore,
            "OUTTREE": dModelTrees[model],
            "OUTPARAMS": dModelParams[model],
            "BACKUP": dModelLog[model],
            "param": bppFile
        }

        # running bppml
        logger.info("Running {:s} optimization".format(model))
        runBpp("bppml", dBppCmd)

        # fill dictionary with loglikelihoods of each model
        if os.path.exists(dModelParams[model]):
            with open(dModelParams[model], "r") as params:
                dLogLlh[model] = float(
                    params.readline().strip().split("= ")[-1])
            logger.info("Log Likelihood = {}".format(dLogLlh[model]))
        else:
            logger.info(
                "Possible failed optimization, likelihood has not been calculated."
            )

    # perform LRT
    # (null model, alternative model, degrees of freedom, result message,
    #  failure message, report the 0.5 * p + 0.5 threshold as well)
    lLRT = [
        ("M1", "M2", 2, "LRT of M1 vs M2: {}",
         "Possible failed optimization, likelihoods of M1 and M2 have not been computed.",
         False),
        ("M7", "M8", 2, "LRT of M7 vs M8: {}",
         "Possible failed optimization, likelihoods of M7 and M8 have not been computed.",
         False),
        ("M8a", "M8", 1, "LRT of M8 vs M8a: {} (Treshold: {})",
         "Possible failed optimization, likelihoods have not been computed.",
         True),
        ("DFP07_0", "DFP07", 1,
         "LRT of DFP07 vs DFP07_07: {} (Treshold: {})",
         "Possible failed optimization, likelihoods have not been computed.",
         True),
    ]
    for null, alt, df, okMsg, failMsg, withThreshold in lLRT:
        # Both models must have been requested: `"A" and "B" in x` only
        # tests "B", which led to a KeyError when "A" was missing.
        if null not in lModels or alt not in lModels:
            continue
        if null in dLogLlh and alt in dLogLlh:
            _, pval = PSPFunc.LRT(dLogLlh[null], dLogLlh[alt], df)
            if withThreshold:
                logger.info(okMsg.format(pval, 0.5 * pval + 0.5))
            else:
                logger.info(okMsg.format(pval))
        else:
            logger.info(failMsg)

    # Bppmixedlikelihoods
    # Run bppmixedlikelihoods on each optimized model. Variables passed are:
    #   INPUTFILE - alignment file
    #   FORMAT - format of the aln file
    #   TREEFILE - tree file for the analyzed aln
    #   params - .params file from model optimization (bppml)
    #   OUTINFO - name of the results file (info about sites etc.)

    # dictionary(model:results file name)
    dModelResults = {
        model: outSite + baseName + "_results_" + model + ".log"
        for model in lModels
    }
    for model in lModels:
        # use tree optimized in M0 for each model
        if "M0" in lModels:
            treeFile = dModelTrees["M0"] + "_1"
        else:
            treeFile = dModelTrees[model] + "_1"

        # these two models are skipped for the mixed likelihood step
        if model in ["M0", "DFP07_0"]:
            continue

        dMixCmd = {
            "INPUTFILE": alnFile,
            "FORMAT": alnFormat,
            "TREEFILE": treeFile,
            "params": dModelParams[model],
            "OUTINFO": dModelResults[model],
            "param": bppMixed
        }

        logger.info("Running mixed likelihoods with model {:s}".format(model))
        runBpp("bppmixedlikelihoods", dMixCmd)
def pspAnalysis(data, parms, aln, tree):
    """
    Procedure which executes the functions of the positive selection step.

    @param data: basicData object (its o, baseName and alnFormat attributes
                 are read here)
    @param parms: dictionary of parameters; the keys "models", "paml",
                  "bppml", "mixedlikelihood", "busted", "meme", "opb" and
                  "gnh" are read
    @param aln: alignment file
    @param tree: tree file
    @return outDir: directory where the results were written
    """
    logger = logging.getLogger("main.positiveSelection")
    dCtrls, lModels = PSPFunc.getParams(parms["models"], parms["paml"],
                                        parms["bppml"],
                                        parms["mixedlikelihood"],
                                        parms["busted"], parms["meme"],
                                        parms["opb"], parms["gnh"])
    timeStamp = strftime("%Y%m%d%H%M", localtime())

    outDir = data.o + "positive_selection_results_" + timeStamp + "/"
    # exist_ok avoids failing if the directory appears between a check and
    # the creation
    os.makedirs(outDir, exist_ok=True)

    logger.info(":" + tree + ":")
    cladoFile = PSPFunc.supBoot(outDir, data.baseName, tree, logger)

    ### Terminal output for user
    logger.info("Output directory: {:s}".format(outDir))
    logger.info("Alignement: {:s}".format(aln))
    logger.info("Alignement is in {:s} format.".format(data.alnFormat))
    logger.info("Tree: {:s}".format(tree))

    ### Run the different analysis as determined by control file
    logger.info("Starting positive selection analyses.")
    logger.info("POSITIVE SELECTION ANALYSIS: ")
    logger.info("Analysis to be run:")

    dAnalysis = {
        "paml": "Site (codeml)",
        "BUSTED": "Whole-Gene",
        "bppml": "Site (Bio++ - Optimization)",
        "bppmixedlikelihood": "Site (Bio++ - Results)",
        "OPB": "Branch",
        "GNH": "Branch-site on positively selected branches",
        "MEME": "Branch-site"
    }
    for key in dCtrls:
        # fall back on the raw key instead of failing on an unlisted one
        logger.info(dAnalysis.get(key, key))

    # Only MEME is protected by a try/except; the other analyses let their
    # exceptions propagate to the caller.
    if "BUSTED" in dCtrls:
        GeneAnalysis.hyphyBusted(aln, cladoFile, outDir, data.baseName,
                                 logger)

    if "MEME" in dCtrls:
        try:
            BranchAnalysis.memeBranchSite(aln, cladoFile, outDir,
                                          data.baseName, logger)
        except Exception:
            # logged at ERROR level with the traceback, then skipped
            logger.exception(
                "MEME encountered an unexpected error, skipping.")

    if "bppml" in dCtrls:
        # without a dedicated mixed likelihood file, reuse the bppml one
        if not dCtrls.get("bppmixedlikelihood"):
            dCtrls["bppmixedlikelihood"] = dCtrls["bppml"]
        SiteAnalysis.bppSite(dCtrls["bppml"], dCtrls["bppmixedlikelihood"],
                             aln, data.alnFormat, tree, lModels, outDir,
                             data.baseName, logger)

    lPSNodes = []
    if "OPB" in dCtrls:
        BranchAnalysis.bppBranch(dCtrls["OPB"], outDir, data.baseName, aln,
                                 data.alnFormat, tree, logger)

    # Both analyses must be requested: `"OPB" and "GNH" in dCtrls` only
    # tested "GNH".
    # NOTE(review): lPSNodes is never filled in this function, so this
    # branch cannot run as written - confirm where the positively selected
    # nodes are supposed to come from.
    if "OPB" in dCtrls and "GNH" in dCtrls and len(lPSNodes) > 1:
        BranchAnalysis.bppBranchSite(dCtrls["GNH"], lPSNodes, outDir,
                                     data.baseName, aln, data.alnFormat,
                                     tree, logger)

    if "paml" in dCtrls:
        SiteAnalysis.pamlSite(aln, tree, lModels, dCtrls["paml"], outDir,
                              data.baseName, logger)

    logger.info("Finished positive selection analyses.")
    return (outDir)
def paramDef(params, inf, queryName, outdir):
    """
    Check the parameters in the file.

    Lines of the parameter file read "name: value"; lines starting with "#"
    are ignored and unknown names are reported and skipped.

    @param params: path of the parameter file
    @param inf: comma-separated input file(s) given on the command line,
                "" to use the "infile" entry of the parameter file
    @param queryName: query name given on the command line, "" to use the
                      parameter file
    @param outdir: output directory given on the command line, "" to use
                   the parameter file
    @return defaultParam: dictionary of parameters, default values being
                          kept for every entry that is missing or empty
    """
    params = params.strip()
    if not os.path.exists(params):
        print("The provided parameter file does not exist, try again.")
        sys.exit()

    # Parsing
    lParams = [
        "infile", "queryName", "queryFile", "blastdb", "outdir", "logfile",
        "evalue", "mincov", "percID", "maxLen", "step", "remote",
        "entryQuery", "sptree", "APIKey", "phymlOpt", "recombination",
        "duplication", "LBopt", "nbspecies", "positiveSelection", "basename",
        "hyphySeuil", "busted", "meme", "models", "paml", "bppml",
        "mixedlikelihood", "opb", "gnh"
    ]
    dParams = {}
    with open(params, mode="r", encoding="utf-8") as content:
        for line in content:
            if line.startswith("#"):
                continue
            # Split on the first ":" only, so that values containing ":"
            # are kept whole and a line without ":" gives an empty value.
            key, _, value = line.partition(":")
            key = key.strip()
            if key == "":
                continue
            if key not in lParams:
                print(key + " is not a valid parameter.\n")
            else:
                dParams[key] = value.strip()

    # If infile(s) given through command line, takes priority
    infiles = inf if inf != "" else dParams.get("infile", "")
    dParams["infile"] = list(map(str.strip, infiles.split(",")))

    # Idem queryName and outdir; the file values (if any) are kept otherwise
    if queryName != "":
        dParams["queryName"] = queryName
    if outdir != "":
        dParams["outdir"] = outdir

    # If list of file given, split and check what each file is
    if len(dParams["infile"]) > 1:
        for entryfile in dParams["infile"]:
            if FormatFunc.isCCDSFasta(entryfile):
                dParams["queryFile"] = os.path.abspath(entryfile)
            if FormatFunc.isAln(entryfile):
                dParams["alnfile"] = os.path.abspath(entryfile)
            if FormatFunc.isTree(entryfile):
                dParams["treefile"] = os.path.abspath(entryfile)
    else:
        dParams["queryFile"] = os.path.abspath(dParams["infile"][0])

    # "infile" becomes a single path: the query file or else the alignment
    if dParams.get("queryFile", "") != "":
        dParams["infile"] = dParams["queryFile"]
    elif dParams.get("alnfile", "") != "":
        dParams["infile"] = dParams["alnfile"]

    # Textual yes/no answers become booleans (values only, no key is added
    # or removed during the iteration)
    answers = ["Y", "YES", "T", "TRUE"]
    negAnswers = ["N", "NO", "F", "FALSE"]
    for param, value in dParams.items():
        if isinstance(value, str):
            if value.upper() in answers:
                dParams[param] = True
            elif value.upper() in negAnswers:
                dParams[param] = False

    # Check if parameters are correct
    lSteps = [
        "blast", "accessions", "fasta", "orf", "alignment", "tree",
        "duplication", "recombination", "positiveSelection", ""
    ]
    if "step" not in dParams or dParams["step"] not in lSteps:
        # .get/str: the entry may be missing, or already turned into a bool
        print("Step \"" + str(dParams.get("step", "")) +
              "\" not available, set to blast by default.")
        dParams["step"] = "blast"
    if dParams["step"] == "":
        dParams["step"] = "blast"

    if "remote" not in dParams or dParams["remote"] == "":
        print("Remote option needs to be a boolean, set to True by default.")
        dParams["remote"] = True

    if "positiveSelection" not in dParams:
        print(
            "Positive selection analyses will not be executed, set to False by default."
        )
        dParams["positiveSelection"] = False

    elif dParams["positiveSelection"]:
        if dParams["step"] == "positiveSelection":
            if dParams.get("treefile", "") == "":
                print(
                    "The pipeline requires a phylogenetic tree. Please provide one."
                )
                sys.exit()
            elif dParams.get("alnfile", "") == "":
                print(
                    "The pipeline requires a codon alignment. Please provide one."
                )
                sys.exit()

        for opt in [
                "meme", "busted", "models", "paml", "bppml",
                "mixedlikelihood", "opb", "gnh"
        ]:
            if opt not in dParams:
                dParams[opt] = ""

            elif opt not in ["meme", "busted", "models", "paml"]:
                # Bio++ options: either an existing parameter file, or a
                # truthy value asking for a generated "<opt>_params.bpp"
                # next to the input file
                if not isinstance(dParams[opt], bool) and os.path.exists(
                        dParams[opt].strip("\n")):
                    dParams[opt] = dParams[opt].strip("\n")
                elif dParams[opt]:
                    path = "/".join(dParams["infile"].split("/")
                                    [:-1]) + "/" + opt + "_params.bpp"
                    PSPFunc.pspFileCreation(path, opt)
                    dParams[opt] = path

            elif opt == "models":
                # keep only the known models, report the others
                ltemp = []
                for M in map(str.strip, dParams[opt].split(",")):
                    if M == "":
                        continue
                    elif M not in [
                            "M0", "M1", "M2", "M7", "M8", "M8a", "DFP07_0",
                            "DFP07"
                    ]:
                        print(M + " isn't a valid model.")
                    else:
                        ltemp.append(M)
                dParams[opt] = ",".join(ltemp)

    elif dParams["step"] == "positiveSelection":
        print(
            "Error: positiveSelection option set to false and step set to positiveSelection."
        )
        sys.exit()

    if dParams["step"] in ["blast", "accessions", "fasta"]:
        # a missing blastdb entry is reported like an empty one
        if dParams["infile"] == "" or dParams.get("blastdb", "") == "":
            print("Infile and Blastdb are necessary.")
            sys.exit()

    # Creation of a dictionary with all the parameters
    defaultParam = {
        "infile": "",
        "queryName": "",
        "queryFile": "",
        "blastdb": "",
        "outdir": "",
        "logfile": "",
        "evalue": 1e-3,
        "mincov": 50,
        "percID": 70,
        "maxLen": "cutoff",
        "entryQuery": "",
        "APIKey": "",
        "phymlOpt": "",
        "sptree": "",
        "duplication": False,
        "LBopt": "cutoff",
        "nbspecies": 8,
        "recombination": False,
        "remote": False,
        "step": "blast",
        "positiveSelection": False,
        "alnfile": "",
        "treefile": "",
        "alnformat": "Fasta",
        "basename": "",
        "hyphySeuil": 0.05,
        "busted": False,
        "meme": False,
        "models": "",
        "paml": "",
        "bppml": "",
        "mixedlikelihood": "",
        "opb": False,
        "gnh": False
    }

    # every non-empty entry read above overrides its default value
    for i in defaultParam:
        if i in dParams and dParams[i] != "":
            defaultParam[i] = dParams[i]

    return defaultParam
def bppBranch(OPBFile, outDir, baseName, alnFile, alnFormat, treeFile,
              logger):
    """
    Bio++ One Per Branch analysis: optimize an M2 model per branch with
    bppml, then re-optimize for each retained branch with theta2 and omega2
    set to 1 and compare both likelihoods with a LRT.

    Results are written to "<outDir>bpp_branch/<baseName>_branch.txt".

    @param OPBFile: bppml parameter file (passed as "param")
    @param outDir: output directory, concatenated as-is with "bpp_branch/"
    @param baseName: prefix of every output file name
    @param alnFile: alignment file (INPUTFILE)
    @param alnFormat: format of the alignment file (FORMAT)
    @param treeFile: tree file (TREEFILE)
    @param logger: logging object
    @return outParams: name of the parameter file of the first optimization
    """

    def readDef(path):
        """Read a "name=value" file into a dictionary of floats."""
        with open(path, "r") as fdef:
            return {
                line.split("=")[0]: float(line.split("=")[1])
                for line in fdef
            }

    def bppmlCommand(dCmd):
        """Build the bppml command line: "k1=v1" "k2=v2" etc..."""
        return "bppml \"" + "\" \"".join(
            [k + "=" + v for k, v in dCmd.items()]) + "\""

    ### BRANCH ANALYSIS: BIO++ ONE PER BRANCH
    logger.info("One Per Branch (BIO++)")
    logger.info("OPB parameter file: {:s}".format(OPBFile))

    outOPB = outDir + "bpp_branch/"
    if not os.path.exists(outOPB):
        os.makedirs(outOPB)

    model = "M2"
    outFileName = outOPB + baseName
    outTree = outFileName + "_" + model + ".dnd"
    outParams = outFileName + "_" + model + ".params"
    outBackup = outFileName + "_optimization_" + model

    # create dictionary with all elements of the two argument lists to
    # build commands
    dBppCmd = {
        "INPUTFILE": alnFile,
        "FORMAT": alnFormat,
        "TREEFILE": treeFile,
        "OUTTREE": outTree,
        "OUTPARAMS": outParams,
        "BACKUP": outBackup,
        "model1": "YNGP_" + model + "(frequencies=F3X4(initFreqs=observed))",
        "param": OPBFile,
        "process1":
        "OnePerBranch(model=1, tree=1, rate=1, root_freq=1, shared_parameters=(*kappa, *Full.theta*))"
    }

    # running bppml
    logger.info("Running Branch optimization")

    ### look for previous M0 optim (file names of the site analysis)
    outSite = outDir + "bpp_site/"
    outSiteFileName = outSite + baseName
    lModels = ["M0", "M2"]
    dPrevModelLog = {
        prev: outSiteFileName + "_optimization_" + prev
        for prev in lModels
    }

    prevmodel, dnewpar = getNewParfromOptim(model, lModels, dPrevModelLog,
                                            logger)
    lignore = []
    if prevmodel != "":
        # start from the parameters derived from the previous optimization
        with open(outBackup, "w") as fnew:
            for k, v in dnewpar.items():
                fnew.write(k + "=" + v.strip() + "\n")
        lignore = setIgnoreParams(model, prevmodel, lModels, logger)

    dBppCmd["IGNORE"] = ",".join(lignore)

    argsOPB = bppmlCommand(dBppCmd)
    logger.debug(argsOPB)
    cmd(argsOPB, False)

    # test each branch
    # Scan all parameter names
    dparam = readDef(dBppCmd["BACKUP"] + ".def")
    valM2 = float(dparam["f(x)"])

    # cp outParams for each branch with M2 replaced with M1
    # only for the where theta1 < 0.999 & theta2 < 0.999
    with open(outParams, "r") as fparam:
        lcmd = fparam.readlines()

    ## Look for correspondance model_nb <-> node_id
    lprocess = [l for l in lcmd if l[:7] == "process"][0]
    lid = lprocess.split(".nodes_id=(")
    cormodid = {}
    # NOTE(review): each chunk is read as "...model<nb>" followed, in the
    # next chunk, by "<node id>)...". The stride of 2 is kept as is; confirm
    # against a real bppml .params file that no (model, node) pair is
    # skipped and that lid[i + 1] always exists.
    for i in range(0, len(lid), 2):
        mod = int(lid[i][lid[i].rfind("l") + 1:])
        idi = int(lid[i + 1][:lid[i + 1].find(")")])
        cormodid[idi] = mod

    # the "process1"/"model1" descriptions are not passed to the per-branch
    # runs, which receive the optimized file through "params" instead
    del dBppCmd["process1"]
    del dBppCmd["model1"]

    ## Compute lk for each node with theta1_mod * theta2_mod < 0.999:
    # `with` closes the result file even if a run or a parse fails
    with open(outFileName + "_branch.txt", "w") as fresbranch:
        fresbranch.write("Id\tomega2\tprop\tM2\tM1\tLR\tp\n")

        for idi, mod in cormodid.items():
            theta1Key = "YNGP_M2.theta1_%d" % mod
            theta2Key = "YNGP_M2.theta2_%d" % mod
            omega0Key = "YNGP_M2.omega0_%d" % mod
            omega2Key = "YNGP_M2.omega2_%d" % mod

            if dparam[theta2Key] * dparam[theta1Key] >= 0.999:
                continue

            # backup file of this branch: every optimized value except
            # theta2, then theta2 and omega2 of this branch set to 1
            with open(outBackup + "_%d" % mod, "w") as fback:
                for key, val in dparam.items():
                    if key != theta2Key:
                        fback.write(key + "=" + str(val) + "\n")
                fback.write("YNGP_M2.theta2_%d=1\n" % mod)
                fback.write("YNGP_M2.omega2_%d=1\n" % mod)

            # only theta1 and omega0 of this branch are left out of the
            # ignore list
            lignore2 = lignore[:] + [
                key for key in dparam if key not in [theta1Key, omega0Key]
            ]

            dBppCmd["IGNORE"] = ",".join(lignore2)
            dBppCmd["params"] = outParams
            dBppCmd["OUTPARAMS"] = outParams + "_%d" % idi
            dBppCmd["BACKUP"] = outBackup + "_%d" % mod

            argsOPB = bppmlCommand(dBppCmd)
            # logger.debug(argsOPB)
            cmd(argsOPB, False)

            dparam2 = readDef(dBppCmd["BACKUP"] + ".def")
            valM1 = float(dparam2["f(x)"])

            LR, p = PSPFunc.LRT(valM2, valM1, 2)
            fresbranch.write(
                "%d\t%f\t%f\t%f\t%f\t%f\t%f\n" %
                (idi, dparam[omega2Key],
                 (1 - dparam[theta2Key]) * (1 - dparam[theta1Key]), valM2,
                 valM1, LR, p))

            if p < 0.05:
                logger.info("Node {:d} is interesting (w = {:f})".format(
                    idi, dparam[omega2Key]))

    return (outParams)
def bppSite(bppFile, bppMixed, alnFile, alnFormat, treeFile, lModels, outDir,
            baseName, logger):
    """
    Bio++ site analysis: optimize each requested model with bppml, compare
    nested models with likelihood ratio tests, then run bppmixedlikelihoods.

    @param bppFile: bppml parameter file (passed as "param")
    @param bppMixed: bppmixedlikelihoods parameter file (passed as "param")
    @param alnFile: alignment file (INPUTFILE)
    @param alnFormat: format of the alignment file (FORMAT)
    @param treeFile: tree file (TREEFILE); replaced by the tree optimized
                     under M0 for the other models when "M0" is in lModels
    @param lModels: list of model names ("M0", "M1", "M2", "M7", "M8",
                    "M8a", "DFP07_0", "DFP07")
    @param outDir: output directory, concatenated as-is with "bpp_site/"
    @param baseName: prefix of every output file name
    @param logger: logging object
    """
    # outDir=os.getcwd()+"/"  # used to debug
    logger.info(os.getcwd())

    ### SITE ANALYSIS: BIO++
    logger.info("Bio++ Site Analysis")
    logger.info("Models to be run: {:s}".format(", ".join(lModels)))
    logger.info("Bppml parameter file: {:s}".format(bppFile))

    def runBpp(program, dCmd):
        """Run a Bio++ program with "key=value" arguments, log its output."""
        # join each couple of the cmd dictionary so that it reads
        # "k1=v1" "k2=v2" etc...
        argsMx = "\"" + "\" \"".join(
            ["{}={}".format(k, v) for k, v in dCmd.items()]) + "\""
        logger.debug(program + " " + argsMx)
        # The command line is kept as a shell string, as before.
        # communicate() drains stdout while waiting: wait() on a piped
        # process can block forever once the pipe buffer is full.
        proc = subprocess.Popen(program + " " + argsMx,
                                shell=True,
                                stdout=subprocess.PIPE)
        out, _ = proc.communicate()
        if out:
            logger.debug(out.decode(errors="replace"))

    ## Bppml
    # Optimize tree and model using bppml. Variables passed are:
    #   INPUTFILE - alignment file
    #   FORMAT - format of the aln file
    #   TREEFILE - tree file for the analyzed aln
    #   MODEL - model run on the data (YNGP_M0 through 8, same models as
    #           PAML, and DFP07 models)
    #   IGNORE - parameters to ignore for optimization, for example if one
    #            is fixed (ex: omegas in M8a)
    #   OUTTREE - name of the optimized output tree
    #   OUTPARAMS - name of the output file summarizing parameters
    #   BACKUP - name of log file

    # Bppml output file names - dictionaries that associate model number
    # with output file name for the model
    outSite = outDir + "bpp_site/"
    if not os.path.exists(outSite):
        os.makedirs(outSite)

    outFileName = outSite + baseName
    dModelTrees = {
        model: outFileName + "_" + model + ".dnd"
        for model in lModels
    }
    dModelParams = {
        model: outFileName + "_" + model + ".params"
        for model in lModels
    }
    dModelLog = {
        model: outFileName + "_optimization_" + model
        for model in lModels
    }

    # dictionary model number - [MODEL name, MODEL arguments for bppml]
    dModelSyntax = {
        model: ["YNGP_" + model, "frequencies=F3X4(initFreqs=observed)"]
        for model in lModels if model[0] == "M"
    }
    dModelSyntax.update({
        model:
        [model[:5], "protmodel=JTT92", "frequencies=F3X4(initFreqs=observed)"]
        for model in lModels if model[:5] == "DFP07"
    })

    # take into account the specificities of each model (number of classes
    # n for example)
    for model in lModels:
        if model in ["M7", "M8"]:
            dModelSyntax[model].append("n=4")
            dModelSyntax[model].append("q=1")
        if model[0] == "M" and len(model) > 2:
            # e.g. M8a is run as YNGP_M8 with omegas fixed to 1
            dModelSyntax[model][0] = dModelSyntax[model][0][:-1]
            dModelSyntax[model].append("omegas=1")
        if model[:5] == "DFP07":
            # p0=0.1 for DFP07 itself, p0=1 for the other DFP07 model
            if model == "DFP07":
                dModelSyntax[model].append("p0=0.1")
            else:
                dModelSyntax[model].append("p0=1")

    dLogLlh = {}  # dictionary(model:logllh)

    for model in lModels:
        prevmodel, dnewpar = getNewParfromOptim(model, lModels, dModelLog,
                                                logger)
        if prevmodel != "":
            # start from the parameters derived from the previous model
            with open(dModelLog[model], "w") as fnew:
                for k, v in dnewpar.items():
                    fnew.write(k + "=" + v.strip() + "\n")
            lignore = setIgnoreParams(model, prevmodel, lModels, logger)
            ignore = ",".join(lignore)
        else:
            ignore = ""

        # if M0 optimization in models, use tree optimized in M0 for
        # subsequent model optimizations
        if model != "M0" and "M0" in lModels:
            treeFile = dModelTrees["M0"] + "_1"

        # create dictionary with all elements of the two argument lists to
        # build commands
        modelDesc = dModelSyntax[model][0] + "(" + ",".join(
            dModelSyntax[model][1:]) + ")"
        dBppCmd = {
            "INPUTFILE": alnFile,
            "FORMAT": alnFormat,
            "TREEFILE": treeFile,
            "MODEL": modelDesc,
            "IGNORE": ignore,
            "OUTTREE": dModelTrees[model],
            "OUTPARAMS": dModelParams[model],
            "BACKUP": dModelLog[model],
            "param": bppFile
        }

        # running bppml
        logger.info("Running {:s} optimization".format(model))
        runBpp("bppml", dBppCmd)

        # fill dictionary with loglikelihoods of each model
        if os.path.exists(dModelParams[model]):
            with open(dModelParams[model], "r") as params:
                dLogLlh[model] = float(
                    params.readline().strip().split("= ")[-1])
            logger.info("Log Likelihood = {}".format(dLogLlh[model]))
        else:
            logger.info(
                "Possible failed optimization, likelihood has not been calculated."
            )

    # perform LRT
    # (null model, alternative model, degrees of freedom, result message,
    #  failure message, report the 0.5 * p + 0.5 threshold as well)
    lLRT = [
        ("M1", "M2", 2, "LRT of M1 vs M2: {}",
         "Possible failed optimization, likelihoods of M1 and M2 have not been computed.",
         False),
        ("M7", "M8", 2, "LRT of M7 vs M8: {}",
         "Possible failed optimization, likelihoods of M7 and M8 have not been computed.",
         False),
        ("M8a", "M8", 1, "LRT of M8 vs M8a: {} (Treshold: {})",
         "Possible failed optimization, likelihoods have not been computed.",
         True),
        ("DFP07_0", "DFP07", 1,
         "LRT of DFP07 vs DFP07_07: {} (Treshold: {})",
         "Possible failed optimization, likelihoods have not been computed.",
         True),
    ]
    for null, alt, df, okMsg, failMsg, withThreshold in lLRT:
        # Both models must have been requested: `"A" and "B" in x` only
        # tests "B", which led to a KeyError when "A" was missing.
        if null not in lModels or alt not in lModels:
            continue
        if null in dLogLlh and alt in dLogLlh:
            _, pval = PSPFunc.LRT(dLogLlh[null], dLogLlh[alt], df)
            if withThreshold:
                logger.info(okMsg.format(pval, 0.5 * pval + 0.5))
            else:
                logger.info(okMsg.format(pval))
        else:
            logger.info(failMsg)

    # Bppmixedlikelihoods
    # Run bppmixedlikelihoods on each optimized model. Variables passed are:
    #   INPUTFILE - alignment file
    #   FORMAT - format of the aln file
    #   TREEFILE - tree file for the analyzed aln
    #   params - .params file from model optimization (bppml)
    #   OUTINFO - name of the results file (info about sites etc.)

    # dictionary(model:results file name)
    dModelResults = {
        model: outSite + baseName + "_results_" + model + ".log"
        for model in lModels
    }
    for model in lModels:
        # use tree optimized in M0 for each model
        if "M0" in lModels:
            treeFile = dModelTrees["M0"] + "_1"
        else:
            treeFile = dModelTrees[model] + "_1"

        # these two models are skipped for the mixed likelihood step
        if model in ["M0", "DFP07_0"]:
            continue

        dMixCmd = {
            "INPUTFILE": alnFile,
            "FORMAT": alnFormat,
            "TREEFILE": treeFile,
            "params": dModelParams[model],
            "OUTINFO": dModelResults[model],
            "param": bppMixed
        }

        logger.info("Running mixed likelihoods with model {:s}".format(model))
        runBpp("bppmixedlikelihoods", dMixCmd)