def _pruneTreesToComplement(file1, file2):
    """Prune two trees down to their shared taxa and return them as Newick.

    ``p4.var.trees`` is emptied, ``file1`` and ``file2`` are read with
    ``p4.read`` and the first two trees then present in ``p4.var.trees``
    are used.  Leaves whose names occur in only one of the two trees are
    removed from that tree.

    Returns a tuple ``(t1, t2, numberOfSharedTaxa)`` holding the first line
    of each pruned tree's Newick string and the number of shared leaf
    names.  When three or fewer leaf names are shared nothing is pruned and
    the sentinel ``('nope', 'nope', 'nope')`` is returned instead.

    Raises ``TCTError`` if the two trees end up with different numbers of
    leaves after pruning.
    """
    p4.var.trees = []
    p4.read(file1)
    p4.read(file2)
    tree1 = p4.var.trees[0]
    tree2 = p4.var.trees[1]

    # Sets make the shared/unshared comparisons below linear instead of
    # scanning a list once per taxon.
    tree1_taxa = set(n.name for n in tree1.nodes if n.isLeaf)
    tree2_taxa = set(n.name for n in tree2.nodes if n.isLeaf)
    numberOfSharedTaxa = len(tree1_taxa & tree2_taxa)
    if numberOfSharedTaxa <= 3:
        return ('nope', 'nope', 'nope')

    # Delete the missing taxa from the trees so that they intersect.
    for t in tree1_taxa - tree2_taxa:
        tree1.removeNode(t, alsoRemoveBiRoot=False)
    for t in tree2_taxa - tree1_taxa:
        tree2.removeNode(t, alsoRemoveBiRoot=False)

    # Sanity check: both trees must now carry the same number of leaves.
    leaves_in_tree1 = sum(1 for n in tree1.nodes if n.isLeaf)
    leaves_in_tree2 = sum(1 for n in tree2.nodes if n.isLeaf)
    if leaves_in_tree1 != leaves_in_tree2:
        # Call form of raise: valid on both Python 2 and Python 3.
        raise TCTError(
            'Something unexpected went wrong with the taxon '
            'pruning. After pruning trees have different number of taxa.')

    t1 = tree1.writeNewick(toString=True).split('\n')[0]
    t2 = tree2.writeNewick(toString=True).split('\n')[0]
    return (t1, t2, numberOfSharedTaxa)
def __init__(self, inal=None):
    '''
    Wrap an alignment so that phylogenetic inference can be run on it.

    :param inal: An alignment file path (most formats are accepted).
        When left as ``None`` a new, empty ``p4.Alignment`` is wrapped.
    '''
    # The alignment object itself comes from the p4 phylogenetic library.
    if inal is not None:
        # p4.read() appends to p4.var.alignments; take the newest entry.
        p4.read(inal)
        alignment = p4.var.alignments[-1]
    else:
        alignment = p4.Alignment()
    self.data = alignment

    # Attach a discrete state model to this wrapper.
    self.model = model.DiscreteStateModel(self)

    # Registry of temporary files created on behalf of this object.
    self.paths = {}
def __init__(self, inal=None):
    '''
    Wrap an alignment so that phylogenetic inference can be run on it.

    :param inal: An alignment file path (most formats are accepted).
        When left as ``None`` a new, empty ``p4.Alignment`` is wrapped.
    '''
    # The alignment object itself comes from the p4 phylogenetic library.
    if inal is not None:
        # p4.read() appends to p4.var.alignments; take the newest entry.
        p4.read(inal)
        alignment = p4.var.alignments[-1]
    else:
        alignment = p4.Alignment()
    self.data = alignment

    # Attach a discrete state model to this wrapper.
    self.model = model.DiscreteStateModel(self)

    # Registry of temporary files created on behalf of this object.
    self.paths = {}
def setuploop(file_1, file_2, outfile):
    """Compare every tree in ``file_1`` with every tree in ``file_2``.

    Both files are read into a freshly emptied ``p4.var.trees``.  For each
    (tree from ``file_1``, tree from ``file_2``) pair, deep copies of the
    two trees are handed to ``_pruneTreesToComplement``.  Unless that
    helper answers with its ``'nope'`` sentinel, the two returned Newick
    strings are parsed with DendroPy, their symmetric difference is
    computed, and one tab-separated record
    (``file_1``, ``file_2``, symmetric difference, number of shared taxa)
    is appended to ``outfile``.
    """
    p4.var.trees = []
    p4.read(file_1)
    first_count = len(p4.var.trees)
    p4.read(file_2)
    total_count = len(p4.var.trees)

    for idx_a in range(first_count):
        # Trees read from file_2 sit behind those read from file_1.
        for idx_b in range(first_count, total_count):
            # Work on copies so the stored trees stay intact between pairs.
            copy_a = copy.deepcopy(p4.var.trees[idx_a])
            copy_b = copy.deepcopy(p4.var.trees[idx_b])
            # NOTE(review): Tree objects are passed here; confirm that this
            # matches what _pruneTreesToComplement expects as arguments.
            newick_a, newick_b, shared = _pruneTreesToComplement(
                copy_a, copy_b)
            if newick_a == 'nope':
                continue

            dendro_a = dendropy.Tree.get_from_string(newick_a, 'newick')
            dendro_b = dendropy.Tree.get_from_string(newick_b, 'newick')
            distance = dendro_a.symmetric_difference(dendro_b)

            record = '\t'.join(
                [file_1, file_2, str(distance), str(shared)]) + '\n'
            with open(outfile, 'a') as handle:
                handle.write(record)
def treeFinderMAPAnalysis(alignment, groups, gamma=True, invariant=True,
                          bootstrap=False, nreplicates=100,
                          remove_files=False, run_analysis=True,
                          verbose=False):
    """
    Uses TreeFinder to estimate a Maximum Likelihood tree using the MAP
    substitution model for grouped amino-acids.

    - *alignment*: p4 alignment object of original (un-recoded) protein
      data from which the "groups" are derived
    - *groups*: list of grouped amino-acids, possibly resulting from
      :meth:`Alignment.getKosiolAISGroups()` or
      :meth:`Alignment.getMinmaxChiSqGroups()`
    - *gamma*: include gamma distribution of among-site rate variation
    - *invariant*: include a proportion of invariant sites
    - *bootstrap*: run bootstrap analysis
    - *nreplicates*: number of bootstrap replicates
    - *remove_files*: remove analysis files. Only available if
      run_analysis=True
    - *run_analysis*: run the analysis (requires ``tf`` in $PATH), else
      just print the TreeFinder control script
    - *verbose*: print progress, the resulting tree and its likelihood

    When the analysis is run, ``tf_data.phy`` and ``tf_control.tl`` are
    written to the current directory and the report is read back from
    ``tf_reconstruction.output``.

    Returns ``(result_tree, rd)`` where ``rd`` is a dict with the keys
    ``Likelihood``, ``Phylogeny``, ``SubstitutionModel``,
    ``OSubstitutionModel``, ``Nsites``, ``NParameters``, ``AIC``, ``AICc``,
    ``HQ``, ``BIC``, ``LikelihoodTime`` and ``LikelihoodMemory`` parsed from
    the report.  Returns ``(None, None)`` if run_analysis is False.

    Raises ``P4Error`` for arguments of the wrong type, a non-protein
    alignment, a missing ``tf`` executable, or a non-zero TreeFinder exit
    code.
    """
    gm = ["p4.alignment_recoding.treeFinderMAPAnalysis()"]

    # ---- argument validation -------------------------------------------
    if not isinstance(alignment, Alignment):
        gm.append("alignment must be a Alignment object")
        raise P4Error(gm)
    if alignment.dataType != "protein":
        gm.append("alignment should be the original protein data from "
                  "which the groups were defined. Doing nothing.")
        raise P4Error(gm)
    flags = [("gamma", gamma), ("invariant", invariant),
             ("bootstrap", bootstrap), ("remove_files", remove_files),
             ("run_analysis", run_analysis), ("verbose", verbose)]
    for flag_name, flag_value in flags:
        if not isinstance(flag_value, bool):
            # Report the parameter name; formatting the raw value would
            # itself fail for tuple values.
            gm.append("%s value must be either True or False" % flag_name)
            raise P4Error(gm)
    if not isinstance(nreplicates, int):
        gm.append("nreplicates must be an integer")
        raise P4Error(gm)
    if run_analysis and not func.which2("tf"):
        gm.append("tf (treefinder) is not in your $PATH. "
                  "Cannot run analysis")
        raise P4Error(gm)

    # ---- TreeFinder control script -------------------------------------
    datafile_name = "tf_data.phy"
    tls = ('ReconstructPhylogeny[ "%(datafile)s", '
           'SubstitutionModel->MAP[%(map)s][Optimum,Optimum]%(ifH)s, '
           'WithEdgeSupport->%(bootstrap)s%(nreplicates)s ], '
           '"%(outfile)s",SaveReport')
    # Rate-heterogeneity suffix, keyed by (gamma, invariant).
    heterogeneity = {
        (True, True): ":GI[Optimum]",
        (True, False): ":G[Optimum]",
        (False, True): ":I[Optimum]",
        (False, False): "",
    }
    od = {}
    od["datafile"] = datafile_name
    od["ifH"] = heterogeneity[(gamma, invariant)]
    if bootstrap:
        od["bootstrap"] = "True"
        od["nreplicates"] = ",NReplicates->%i" % nreplicates
    else:
        od["bootstrap"] = "False"
        od["nreplicates"] = ""
    od["outfile"] = "tf_reconstruction.output"
    od["map"] = ",".join('"%s"' % group.upper() for group in groups)

    if not run_analysis:
        print(tls % od)
        return (None, None)

    # ---- run TreeFinder ------------------------------------------------
    alignment.writePhylip(datafile_name)
    control_file = "tf_control.tl"
    with open(control_file, "w") as fh:
        fh.write(tls % od)

    # Only stderr is redirected: merged into stdout when verbose,
    # discarded otherwise.  The sink is closed once the child is done.
    sink = None if verbose else open(os.devnull, "w")
    try:
        child = subprocess.Popen(
            "tf " + control_file,
            stderr=subprocess.STDOUT if verbose else sink,
            shell=True)
        if verbose:
            sys.stdout.write(
                "Running TreeFinder, this could take some time... ")
            sys.stdout.flush()
        child.communicate()
    finally:
        if sink is not None:
            sink.close()
    if verbose:
        print("done.")
        sys.stdout.flush()

    if child.returncode != 0:
        gm.append("TreeFinder returned error code %s" % (child.returncode))
        raise P4Error(gm)

    # ---- parse the report ----------------------------------------------
    # The values of interest are all on the second line of the report.
    with open(od["outfile"], "r") as fh:
        line = fh.readlines()[1]

    def between(start_key, end_key):
        # Text following start_key, up to but excluding the single
        # separator character that precedes end_key.
        begin = line.index(start_key) + len(start_key)
        return line[begin:line.index(end_key) - 1]

    rd = {}
    # The likelihood runs up to the first comma of the line.
    rd["Likelihood"] = float(
        line[line.index("Likelihood->") + 12:line.index(",")])
    rd["Phylogeny"] = between("Phylogeny->", "SubstitutionModel->")
    rd["SubstitutionModel"] = between("SubstitutionModel->",
                                      "OSubstitutionModel->")
    rd["OSubstitutionModel"] = between("OSubstitutionModel->",
                                       "OEdgeOptimizationOff->")
    rd["Nsites"] = int(between("NSites->", "NParameters->"))
    rd["NParameters"] = int(between("NParameters->", "AIC->"))
    rd["AIC"] = float(between("AIC->", "AICc->"))
    rd["AICc"] = float(between("AICc->", "HQ->"))
    rd["HQ"] = float(between("HQ->", "BIC->"))
    rd["BIC"] = float(between("BIC->", "Checksum->"))
    rd["LikelihoodTime"] = float(between("LikelihoodTime->",
                                         "LikelihoodMemory->"))
    # Last field: drop the three trailing characters of the line.
    rd["LikelihoodMemory"] = int(
        line[line.index("LikelihoodMemory->") + 18:-3])

    # ---- make a tree object --------------------------------------------
    tree = rd["Phylogeny"].replace("{", "(").replace("}", ")")
    tree = tree.replace("\"", "") + ";"
    if bootstrap:
        # Tree viewer has the brlen before bootstrap value plus an extra
        # colon:
        # turn "xxx):0.00001:87.999,yyy" into "xxx)87.999:0.00001,yyy"
        patt = re.compile(r"\):([0-9]+\.[0-9e-]+):([0-9]+\.[0-9e-]*)")
        tree = patt.sub(r")\2:\1", tree)

    # Silence the "not a file" warning while reading the tree string, and
    # restore the setting even if read() fails.
    origw = var.warnReadNoFile
    var.warnReadNoFile = False
    try:
        read(tree)
    finally:
        var.warnReadNoFile = origw
    result_tree = var.trees.pop()

    if bootstrap:
        # Round up floats to percentages
        for node in result_tree.iterInternalsNoRoot():
            node.name = "%2.f" % float(node.name)

    if remove_files:
        for path in (control_file, datafile_name, od["outfile"]):
            os.remove(path)

    if verbose:
        print("\n")
        result_tree.draw()
        print("\nLikelihood: %.4f\n" % rd["Likelihood"])
    return result_tree, rd
def treeFinderMAPAnalysis(alignment, groups, gamma=True, invariant=True,
                          bootstrap=False, nreplicates=100,
                          remove_files=False, run_analysis=True,
                          verbose=False):
    """
    Uses TreeFinder to estimate a Maximum Likelihood tree using the MAP
    substitution model for grouped amino-acids.

    - *alignment*: p4 alignment object of original (un-recoded) protein
      data from which the "groups" are derived
    - *groups*: list of grouped amino-acids, possibly resulting from
      :meth:`Alignment.getKosiolAISGroups()` or
      :meth:`Alignment.getMinmaxChiSqGroups()`
    - *gamma*: include gamma distribution of among-site rate variation
    - *invariant*: include a proportion of invariant sites
    - *bootstrap*: run bootstrap analysis
    - *nreplicates*: number of bootstrap replicates
    - *remove_files*: remove analysis files. Only available if
      run_analysis=True
    - *run_analysis*: run the analysis (requires ``tf`` in $PATH), else
      just print the TreeFinder control script
    - *verbose*: print progress, the resulting tree and its likelihood

    When the analysis is run, ``tf_data.phy`` and ``tf_control.tl`` are
    written to the current directory and the report is read back from
    ``tf_reconstruction.output``.

    Returns ``(result_tree, rd)`` where ``rd`` is a dict with the keys
    ``Likelihood``, ``Phylogeny``, ``SubstitutionModel``,
    ``OSubstitutionModel``, ``Nsites``, ``NParameters``, ``AIC``, ``AICc``,
    ``HQ``, ``BIC``, ``LikelihoodTime`` and ``LikelihoodMemory`` parsed from
    the report.  Returns ``(None, None)`` if run_analysis is False.

    Raises ``P4Error`` for arguments of the wrong type, a non-protein
    alignment, a missing ``tf`` executable, or a non-zero TreeFinder exit
    code.
    """
    gm = ["p4.alignment_recoding.treeFinderMAPAnalysis()"]

    # ---- argument validation -------------------------------------------
    if not isinstance(alignment, Alignment):
        gm.append("alignment must be a Alignment object")
        raise P4Error(gm)
    if alignment.dataType != "protein":
        gm.append("alignment should be the original protein data from "
                  "which the groups were defined. Doing nothing.")
        raise P4Error(gm)
    flags = [("gamma", gamma), ("invariant", invariant),
             ("bootstrap", bootstrap), ("remove_files", remove_files),
             ("run_analysis", run_analysis), ("verbose", verbose)]
    for flag_name, flag_value in flags:
        if not isinstance(flag_value, bool):
            # Report the parameter name; formatting the raw value would
            # itself fail for tuple values.
            gm.append("%s value must be either True or False" % flag_name)
            raise P4Error(gm)
    if not isinstance(nreplicates, int):
        gm.append("nreplicates must be an integer")
        raise P4Error(gm)
    if run_analysis and not func.which2("tf"):
        gm.append("tf (treefinder) is not in your $PATH. "
                  "Cannot run analysis")
        raise P4Error(gm)

    # ---- TreeFinder control script -------------------------------------
    datafile_name = "tf_data.phy"
    tls = ('ReconstructPhylogeny[ "%(datafile)s", '
           'SubstitutionModel->MAP[%(map)s][Optimum,Optimum]%(ifH)s, '
           'WithEdgeSupport->%(bootstrap)s%(nreplicates)s ], '
           '"%(outfile)s",SaveReport')
    # Rate-heterogeneity suffix, keyed by (gamma, invariant).
    heterogeneity = {
        (True, True): ":GI[Optimum]",
        (True, False): ":G[Optimum]",
        (False, True): ":I[Optimum]",
        (False, False): "",
    }
    od = {}
    od["datafile"] = datafile_name
    od["ifH"] = heterogeneity[(gamma, invariant)]
    if bootstrap:
        od["bootstrap"] = "True"
        od["nreplicates"] = ",NReplicates->%i" % nreplicates
    else:
        od["bootstrap"] = "False"
        od["nreplicates"] = ""
    od["outfile"] = "tf_reconstruction.output"
    od["map"] = ",".join('"%s"' % group.upper() for group in groups)

    if not run_analysis:
        print(tls % od)
        return (None, None)

    # ---- run TreeFinder ------------------------------------------------
    alignment.writePhylip(datafile_name)
    control_file = "tf_control.tl"
    with open(control_file, "w") as fh:
        fh.write(tls % od)

    # Only stderr is redirected: merged into stdout when verbose,
    # discarded otherwise.  subprocess.DEVNULL avoids holding an open
    # handle on /dev/null that nobody closes.
    child = subprocess.Popen(
        "tf " + control_file,
        stderr=subprocess.STDOUT if verbose else subprocess.DEVNULL,
        shell=True)
    if verbose:
        print("Running TreeFinder, this could take some time...", end=' ')
        sys.stdout.flush()
    child.communicate()
    if verbose:
        print("done.")
        sys.stdout.flush()

    if child.returncode != 0:
        gm.append("TreeFinder returned error code %s" % (child.returncode))
        raise P4Error(gm)

    # ---- parse the report ----------------------------------------------
    # The values of interest are all on the second line of the report.
    with open(od["outfile"], "r") as fh:
        line = fh.readlines()[1]

    def between(start_key, end_key):
        # Text following start_key, up to but excluding the single
        # separator character that precedes end_key.
        begin = line.index(start_key) + len(start_key)
        return line[begin:line.index(end_key) - 1]

    rd = {}
    # The likelihood runs up to the first comma of the line.
    rd["Likelihood"] = float(
        line[line.index("Likelihood->") + 12:line.index(",")])
    rd["Phylogeny"] = between("Phylogeny->", "SubstitutionModel->")
    rd["SubstitutionModel"] = between("SubstitutionModel->",
                                      "OSubstitutionModel->")
    rd["OSubstitutionModel"] = between("OSubstitutionModel->",
                                       "OEdgeOptimizationOff->")
    rd["Nsites"] = int(between("NSites->", "NParameters->"))
    rd["NParameters"] = int(between("NParameters->", "AIC->"))
    rd["AIC"] = float(between("AIC->", "AICc->"))
    rd["AICc"] = float(between("AICc->", "HQ->"))
    rd["HQ"] = float(between("HQ->", "BIC->"))
    rd["BIC"] = float(between("BIC->", "Checksum->"))
    rd["LikelihoodTime"] = float(between("LikelihoodTime->",
                                         "LikelihoodMemory->"))
    # Last field: drop the three trailing characters of the line.
    rd["LikelihoodMemory"] = int(
        line[line.index("LikelihoodMemory->") + 18:-3])

    # ---- make a tree object --------------------------------------------
    tree = rd["Phylogeny"].replace("{", "(").replace("}", ")")
    tree = tree.replace("\"", "") + ";"
    if bootstrap:
        # Tree viewer has the brlen before bootstrap value plus an extra
        # colon:
        # turn "xxx):0.00001:87.999,yyy" into "xxx)87.999:0.00001,yyy"
        patt = re.compile(r"\):([0-9]+\.[0-9e-]+):([0-9]+\.[0-9e-]*)")
        tree = patt.sub(r")\2:\1", tree)

    # Silence the "not a file" warning while reading the tree string, and
    # restore the setting even if read() fails.
    origw = var.warnReadNoFile
    var.warnReadNoFile = False
    try:
        read(tree)
    finally:
        var.warnReadNoFile = origw
    result_tree = var.trees.pop()

    if bootstrap:
        # Round up floats to percentages
        for node in result_tree.iterInternalsNoRoot():
            node.name = "%2.f" % float(node.name)

    if remove_files:
        for path in (control_file, datafile_name, od["outfile"]):
            os.remove(path)

    if verbose:
        print("\n")
        result_tree.draw()
        print("\nLikelihood: %.4f\n" % rd["Likelihood"])
    return result_tree, rd
# Script section: read one alignment, derive a starting tree from its
# pairwise distances, attach a model to that tree, optimise it and pickle
# the result.

# Command-line arguments: the alignment file to analyse, and a second value
# stored in ``n`` (not used by the statements in this section).
in_file = argv[1]
n = argv[2]

# don't check for empty sequences or sites since
# p4 does not consider those in the test anyways
p4.var.doCheckForAllGapColumns = False

print( ''' ========== calculating test stats for {} ========== '''.format(in_file) )

# p4.read() stores what it reads in p4.var; the first alignment is used.
p4.read(in_file)
a = p4.var.alignments[0]

# Starting tree: built by bionj() from the alignment's pDistances() matrix.
dm = a.pDistances()
t = dm.bionj()

# NOTE(review): p4.Data() is called without arguments here; presumably it
# picks up the alignment read above -- confirm against the p4 docs.
d = p4.Data()
t.data = d

# Model set-up: free empirical composition, free rate matrix starting from
# 'ones', four gamma categories with a free gdasrv starting at 0.5, and
# pInvar fixed (free=0) at 0.0.
t.newComp(free=1, spec='empirical')
t.newRMatrix(free=1, spec='ones')
t.setNGammaCat(nGammaCat=4)
t.newGdasrv(free=1, val=0.5)
t.setPInvar(free=0, val=0.0)

# Optimise, then name and pickle the tree.
# NOTE(review): presumably tPickle() derives its output file name from
# t.name -- confirm.
t.optLogLike()
t.name = 'homogOpt'
t.tPickle()
# NOTE(review): the next two statements look like the body of a usage guard
# whose condition lies outside this view -- confirm their indentation
# against the preceding line before applying.
print "usage: "+sys.argv[0]+" <starting tree>"
sys.exit(0)


def f5(seq, idfun=None):
    """Return the items of ``seq`` with duplicates dropped, in first-seen order.

    Two items count as duplicates when ``idfun`` maps them to the same
    marker; ``idfun`` defaults to the identity function.  Only the first
    item seen for each marker is kept.  Markers are used as dict keys.
    """
    if idfun is None:
        def idfun(x): return x
    # Used purely as a "marker already seen" lookup; the values are unused.
    seen = {}
    result = []
    for item in seq:
        marker = idfun(item)
        if marker in seen:
            continue
        seen[marker] = 1
        result.append(item)
    return result


# Read the starting tree given on the command line.
p4.read(sys.argv[1])
t = p4.var.trees[0]
di = []
alt = True
alt2 = False
# Perturb 10000 duplicates of the starting tree.  With the flags as set in
# the visible code the first duplicate gets nni(), the second randomSpr(),
# and every later one nni(), because neither flag is reset afterwards.
# NOTE(review): ``di`` is never filled and ``d`` is never stored here; the
# loop body probably continues beyond this view.
for i in range(10000):
    d = t.dupe()
    if alt == True:
        d.nni()
        alt = False
    else:
        if alt2 == False:
            d.randomSpr()
            alt2 = True
        else:
            d.nni()
import pandas as pd

# Script section (Python 2 print statements): compute the log-likelihood of
# every tree in a phylogram file against one alignment.

# NOTE(review): the usage text lists three arguments (test set, log,
# phylograms) but the code below reads the phylograms file from argv[2] and
# never reads a log file -- confirm which of the two is out of date.
print ''' The order of the arguments is: - test set file - log file - phylograms file '''
print sys.argv
test_set_file = sys.argv[1]
phylogs_file = sys.argv[2]

# p4.read() stores what it reads in p4.var: the alignment first, then the
# trees.
p4.read(test_set_file)
a = p4.var.alignments[0]
p4.read(phylogs_file)

for i in range(len(p4.var.trees)):
    print i
    t = p4.var.trees[i]
    t.data = p4.Data(a)
    # Model: free composition starting from 'equal', fixed (free=0) rate
    # matrix of 'ones', a single gamma category, pInvar fixed at 0.0.
    t.newComp(free = 1, spec = 'equal')
    t.newRMatrix(free = 0, spec = 'ones')
    t.setNGammaCat(partNum = 0, nGammaCat=1)
    # t.newGdasrv(partNum=0,free = 1, val = gamma.ix[i])
    t.setPInvar(free = 0, val = 0.0)
    t.calcLogLike()
    # NOTE(review): presumably drops the model so it is not kept alive for
    # every tree -- confirm.
    t.model = None