def __init__(self, inputTree, distributionTrees=None):
    """Set up a generator of input trees for supertree evaluation.

    SuperTreeInputTrees is a utility to create sets of input trees.
    The input trees are primarily to be used to evaluate super tree
    construction methods.

    Invocation removing a fixed number of taxa from each prospective
    input tree::

        stit = SuperTreeInputTrees(inputTree)
        stit.writeInputTreesToFile = True
        stit.outputFile = 'myInputtrees.tre'
        stit.noTaxaToRemove = 32
        stit.noOutputTrees = 10
        stit.generateInputTrees()

    Invocation using built in distribution gathered from real world
    super tree cases::

        stit = SuperTreeInputTrees(inputTree)
        stit.writeInputTreesToFile = True
        stit.outputFile = 'myInputtrees.tre'
        stit.useTaxonDistribution = True
        stit.generateInputTrees()

    The user can generate a distribution of their own by supplying a
    list of p4 trees or a tree file. The order of the trees is
    important, supertree and then all other trees. This goes for both
    list and file. Like so::

        stit = SuperTreeInputTrees(inputTree,
                                   distributionTrees='myTreefile.nex')
        stit.writeInputTreesToFile = True
        stit.outputFile = 'myInputtrees.tre'
        stit.useTaxonDistribution = True
        stit.generateInputTrees()

    Placeholders which allow access to data after completed
    computations::

        stit.outputTrees
        stit.dist

    Args:
        inputTree: a p4 Tree, or a string that ``read()`` can turn into
            exactly one tree.
        distributionTrees: optional; either a list of p4 Trees or a
            string handed to ``read()``. The first tree is taken as the
            supertree, the remaining ones as its input trees. The
            caller's list is not modified.

    Raises:
        P4Error: if inputTree is neither a Tree nor a string, if the
            string yields no tree or more than one tree, or if
            distributionTrees is not a list of Trees / a string that
            yields at least one tree.
    """
    self.writeInputTreesToFile = False
    self.outputFile = 'inputtrees.tre'

    # Set to False if you want to have a set number of taxa in the output
    # trees
    self.useTaxonDistribution = False

    # Only meaningful if setting useTaxonDistribution = False
    self.noTaxaToRemove = 32

    self.noOutputTrees = 10

    gm = ['SuperTreeInputTrees()']

    if isinstance(inputTree, Tree):
        self.inputTree = inputTree  # not a list.
    elif isinstance(inputTree, str):
        var.trees = []
        read(inputTree)
        if not var.trees:
            # Without this guard the pop() below fails with a bare
            # IndexError instead of a P4Error.
            gm.append("No tree could be read from %s" % inputTree)
            raise P4Error(gm)
        if len(var.trees) > 1:
            gm.append('Sorry, supply only one tree as supertree')
            raise P4Error(gm)
        # this was originally a list, ie [var.trees.pop()]
        self.inputTree = var.trees.pop()
    else:
        gm.append("Input tree was neither a p4 Tree nor a valid filename")
        gm.append("Got %s" % inputTree)
        raise P4Error(gm)

    if not self.inputTree._taxNames:
        self.inputTree._setTaxNamesFromLeaves()

    self.outputTrees = []
    self.normalizedDist = []

    # Distributions gathered from real world supertree input
    # The dists are first a list of input tree taxon set sizes and the
    # supertree taxon set size
    # Using this data we can normalize the dists to fit the size of trees
    # we want

    # BunnyRSVNormal set from Wilkinson et al 2005, Syst Biol 54:823
    # self.dist = [[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4,
    #     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5,
    #     5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 8, 8,
    #     8, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12,
    #     13, 13, 13, 13, 13, 14, 14, 15, 17, 17, 18, 18, 18, 18, 18, 19,
    #     19, 19, 20, 20, 20, 21, 22, 22, 23, 24, 25, 25, 25, 25, 25, 25,
    #     26, 27, 28, 29, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
    #     37, 38, 38, 40, 40, 41, 47, 51, 51, 52, 52, 52, 68, 70, 78, 78,
    #     79, 80, 80], 80]

    # CanidaeRVS set from Wilkinson et al 2005, Syst Biol 54:823
    # self.dist = [[3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 6, 7, 8, 8, 9,
    #     10, 11, 11, 11, 12, 16, 16, 20, 23, 24, 30, 30, 33, 34, 34, 34,
    #     34, 34], 34]

    # CarnivoraRVS set from Wilkinson et al 2005, Syst Biol 54:823
    # self.dist = [[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4,
    #     4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7,
    #     8, 8, 8, 8, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 12,
    #     12, 12, 12, 12], 12]

    # DavideDinoMRP set from Wilkinson et al 2005, Syst Biol 54:823
    # self.dist = [[4, 4, 4, 5, 6, 6, 6, 7, 8, 8, 9, 9, 9, 10, 10, 10, 10,
    #     10, 11, 11, 11, 12, 12, 12, 13, 14, 14, 14, 14, 15, 15, 15, 15,
    #     16, 16, 17, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 20, 20, 20,
    #     22, 23, 23, 24, 24, 25, 26, 27, 27, 28, 28, 29, 29, 29, 29, 29,
    #     30, 30, 30, 31, 31, 31, 31, 33, 33, 33, 33, 36, 37, 37, 38, 38,
    #     39, 42, 45, 47, 48, 50, 53, 53, 66, 70, 71, 74, 74, 75, 75, 76,
    #     78, 78, 80, 86, 86, 92, 94, 96, 100, 101, 102, 102, 103, 105, 110,
    #     111, 111, 139, 148, 149, 153, 173, 199, 204, 217, 240, 269, 270,
    #     271, 272, 272, 273, 273, 273, 273, 274, 275], 277]

    # FelidaeRVS set from Wilkinson et al 2005, Syst Biol 54:823
    self.dist = [[
        3, 3, 3, 3, 3, 4, 4, 4, 5, 6, 6, 6, 7, 7, 7, 7, 9, 9, 10, 10,
        14, 16, 17, 24, 25, 28, 29, 29, 30, 30, 32, 34, 36, 36, 36, 36,
        36, 36, 36, 36
    ], 36]

    # KennedyPageData set from Wilkinson et al 2005, Syst Biol 54:823
    # self.dist = [[14, 16, 17, 20, 30, 30, 90], 122]

    # ViverridaeRVS set from Wilkinson et al 2005, Syst Biol 54:823
    # self.dist = [[4, 5, 10, 16, 19, 33, 34, 34, 34], 34]

    if distributionTrees:
        self.useTaxonDistribution = True

        if isinstance(distributionTrees, list):
            for t in distributionTrees:
                if not isinstance(t, Tree):
                    gm.append(
                        "Input trees should be a list of p4 Tree objects. Got %s"
                        % t)
                    raise P4Error(gm)
            # Slice rather than pop(0) so the caller's list is left intact.
            superTree = distributionTrees[0]
            inputTrees = distributionTrees[1:]
        elif isinstance(distributionTrees, str):
            # This branch used to test for ``list`` a second time, which
            # made it unreachable and left superTree unbound for filenames.
            var.trees = []
            read(distributionTrees)
            if len(var.trees) < 1:
                gm.append(
                    'Sorry, at least one tree must be supplied as input tree'
                )
                raise P4Error(gm)
            superTree = var.trees.pop(0)
            inputTrees = var.trees
        else:
            gm.append(
                "Distribution trees are neither a list of p4 Tree objects "
                "nor a valid filename.")
            raise P4Error(gm)

        self._generateDistribution(superTree, inputTrees)
def __init__(self, supertree, inputTrees):
    """Prepare a supertree and its input trees for support calculations.

    Initialises the output options and the statistic placeholders, loads
    the trees, records the mean and median pairwise taxon overlap of the
    input trees (``self.mean`` / ``self.median``), and reduces all trees
    to a common leaf set via ``CommonLeafSet``.

    Args:
        supertree: a p4 Tree, or a string that ``read()`` can turn into
            exactly one tree.
        inputTrees: a list of p4 Trees, or a string handed to ``read()``
            that yields at least one tree.

    Raises:
        P4Error: if either argument has an unsupported type, if the
            input-tree source yields no trees, or if the supertree
            source yields no tree or more than one tree.
    """
    # There are two ways of decorating the supertree with the support
    # values. Standard conforms to the consensus tree tradition, i.e.
    # values are presented between 0 to 100 percent. Non standard adheres
    # to the few supertree papers regarding support values, i.e. -1 to 1.
    self.doStandardDecoration = True

    # The decorated supertree can be saved to file
    self.doSaveDecoratedTree = False
    self.decoratedFilename = 'superTreeSupport.nex'

    # There is an option to save a supertree decorated with index values
    # instead of support values. This can then be used with a csv file
    # containing the support values for each index. Further analysis of
    # the support values can be performed and then matched to the indices
    # in the decorated supertree
    self.doSaveIndexTree = False
    self.indexFilename = 'supertreeIndex.nex'
    self.csvFilename = 'supertreeIndex.csv'

    # Draws the decorated supertree to screen
    self.doDrawTree = False

    # Produces output to screen
    self.verbose = 1

    # Placeholders that allow access to the data after completing
    # calculations
    self.decoratedSuperTree = None
    self.indexSuperTree = None
    self.csvList = None

    # Keeps track of splits for producing output
    self.indexIntersections = []
    self.csvValues = []

    self.intersections = []

    # Let t be the number of input trees,
    # s the number of input trees supporting a supertree clade,
    # r the number of input trees that are irrelevant to the supertree clade,
    # q the number of input trees that conflict with the supertree clade,
    # p the number of input trees that permit the supertree clade,
    # so that t = p + q + r + s.
    self.T = 0  # no. of input trees;
    self.L = 0  # no. of leaves;
    # coverage (average proportion of leaves in the input tree);
    self.C = 0.0
    self.SC = 0  # number of supertree clades;
    self.U = 0  # no. of unsupported supertree clades;
    # no. of unsupported supertree clades that conflict with at least one
    # input tree;
    self.UC = 0
    # no. of unsupported clades conflicting with all relevant input trees;
    self.UCC = 0
    # average qualitative support for supertree clades. Figures in
    # parentheses are ranges.
    self.QS = 0.0
    self.S = 0.0  # average support
    self.P = 0.0  # average permitted
    self.Q = 0.0  # average conflict
    self.R = 0.0  # average relevance
    self.wS = 0.0  # average weighted support
    self.wP = 0.0  # average weighted permittance
    self.V = 0.0  # average V for supertree clades, V = (s minus q)/(s + q)
    self.VV = 0.0  # V+ = (s minus q +p)/(s + q + p)
    self.Vv = 0.0  # V minus = (s minus q minus p)/(s + q + p)
    self.wV = 0.0  # wV = (ws minus q)/(ws + q)
    self.wVV = 0.0  # wVV = (ws minus q +wp)/(ws + q + wp)
    self.wVv = 0.0  # wVv = (ws minus q minus wp)/(ws + q + wp)

    gm = ['SuperTreeSupport()']

    # NOTE: this module-wide flag is switched off here and is not restored
    # by this constructor.
    var.warnReadNoFile = False

    if isinstance(inputTrees, list):
        for t in inputTrees:
            if not isinstance(t, Tree):
                gm.append(
                    "Input trees should be a list of p4 Tree objects. Got %s"
                    % t)
                raise P4Error(gm)
        self.inputTrees = inputTrees
    elif isinstance(inputTrees, str):
        var.trees = []
        read(inputTrees)
        if len(var.trees) < 1:
            gm.append(
                'Sorry, at least one tree must be supplied as input tree')
            raise P4Error(gm)
        self.inputTrees = var.trees
    else:
        gm.append(
            "Input trees are neither a list of p4 Tree objects nor a valid filename."
        )
        raise P4Error(gm)

    if isinstance(supertree, Tree):
        self.supertree = supertree  # not a list.
    elif isinstance(supertree, str):
        # Rebinding var.trees (rather than clearing it) keeps the list
        # already stored in self.inputTrees untouched.
        var.trees = []
        read(supertree)
        if not var.trees:
            # Without this guard the pop() below fails with a bare
            # IndexError instead of a P4Error.
            gm.append("No tree could be read from %s" % supertree)
            raise P4Error(gm)
        if len(var.trees) > 1:
            gm.append('Sorry, supply only one tree as supertree')
            raise P4Error(gm)
        # this was originally a list, ie [var.trees.pop()]
        self.supertree = var.trees.pop()
    else:
        gm.append("Supertree was neither a p4 Tree nor a valid filename")
        gm.append("Got %s" % supertree)
        raise P4Error(gm)

    for tree in self.inputTrees:
        if not tree._taxNames:
            tree._setTaxNamesFromLeaves()

    # Mean and median overlap of the input trees.
    # Build each taxon set once instead of once per pair.
    taxSets = [set(tree.taxNames) for tree in self.inputTrees]
    overlapList = []
    for i in range(len(taxSets) - 1):
        for j in range(i + 1, len(taxSets)):
            overlapList.append(len(taxSets[i].intersection(taxSets[j])))

    if not overlapList:
        # Fewer than two input trees: there are no pairs to compare.
        self.mean = 0
        self.median = 0
    else:
        self.mean = sum(overlapList) / float(len(overlapList))
        overlapList.sort()
        # Floor division: a float index raises TypeError on Python 3.
        self.median = overlapList[len(overlapList) // 2]

    commonLeafSet = CommonLeafSet()
    self.splits = commonLeafSet.updateTreesToCommonLeafSet(
        [self.inputTrees, [self.supertree]])
    self.bitkeys = commonLeafSet.getCommonBitkeys()
    self.taxnames = commonLeafSet.getCommonTaxNames()
    self.taxa2Bitkey = commonLeafSet.getCommonTaxa2Bitkey()
def treeFinderMAPAnalysis(alignment, groups, gamma=True, invariant=True,
                          bootstrap=False, nreplicates=100,
                          remove_files=False, run_analysis=True,
                          verbose=False):
    """Estimate a Maximum Likelihood tree with TreeFinder's MAP model.

    Uses TreeFinder to estimate a Maximum Likelihood tree using the MAP
    substitution model for grouped amino-acids.

    - *alignment*: p4 alignment object of original (un-recoded) protein
      data from which the "groups" are derived
    - *groups*: list of grouped amino-acids, possibly resulting from
      :meth:`Alignment.getKosiolAISGroups()` or
      :meth:`Alignment.getMinmaxChiSqGroups()`
    - *gamma*: include gamma distribution of among-site rate variation
    - *invariant*: include a proportion of invariant sites
    - *bootstrap*: run bootstrap analysis
    - *nreplicates*: number of bootstrap replicates
    - *remove_files*: remove analysis files. Only available if
      run_analysis=True
    - *run_analysis*: run the analysis (requires ``tf`` in $PATH); if
      False the TreeFinder control script is only printed to stdout
    - *verbose*: show progress messages, TreeFinder's stderr, and draw
      the resulting tree

    Returns a ``(tree, results)`` tuple. With run_analysis=True, *tree* is
    the p4 tree read from TreeFinder's report and *results* is a dict of
    the values parsed from that report (keys ``Likelihood``,
    ``Phylogeny``, ``SubstitutionModel``, ``OSubstitutionModel``,
    ``Nsites``, ``NParameters``, ``AIC``, ``AICc``, ``HQ``, ``BIC``,
    ``LikelihoodTime``, ``LikelihoodMemory``). With run_analysis=False the
    result is ``(None, None)``.

    Raises P4Error on invalid arguments, when ``tf`` cannot be found, or
    when TreeFinder exits with a non-zero status.
    """
    gm = ["p4.alignment_recoding.treeFinderMAPAnalysis()"]

    if not isinstance(alignment, Alignment):
        gm.append("alignment must be a Alignment object")
        raise P4Error(gm)

    if alignment.dataType != "protein":
        gm.append("alignment should be the original protein data from "
                  "which the groups were defined. Doing nothing.")
        raise P4Error(gm)

    flags = (
        ("gamma", gamma),
        ("invariant", invariant),
        ("bootstrap", bootstrap),
        ("remove_files", remove_files),
        ("run_analysis", run_analysis),
        ("verbose", verbose),
    )
    for flag_name, flag_value in flags:
        if not isinstance(flag_value, bool):
            # Report the parameter name; interpolating the raw value with
            # "%" raises TypeError when the value is a tuple.
            gm.append("%s value must be either True or False" % flag_name)
            raise P4Error(gm)

    if not isinstance(nreplicates, int):
        gm.append("nreplicates must be an integer")
        raise P4Error(gm)

    if run_analysis:
        if not p4.func.which2("tf"):
            gm.append("tf (treefinder) is not in your $PATH. "
                      "Cannot run analysis")
            raise P4Error(gm)

    datafile_name = "tf_data.phy"

    # TreeFinder control script template; the text is kept exactly as it
    # was, only split across source lines.
    tls = (
        'ReconstructPhylogeny[ "%(datafile)s", '
        'SubstitutionModel->MAP[%(map)s][Optimum,Optimum]%(ifH)s, '
        'WithEdgeSupport->%(bootstrap)s%(nreplicates)s ], '
        '"%(outfile)s",SaveReport'
    )

    # Rate-heterogeneity suffix selected by (gamma, invariant).
    heterogeneity = {
        (True, True): ":GI[Optimum]",
        (True, False): ":G[Optimum]",
        (False, True): ":I[Optimum]",
        (False, False): "",
    }

    od = {}
    od["datafile"] = datafile_name
    od["ifH"] = heterogeneity[(gamma, invariant)]
    if bootstrap:
        od["bootstrap"] = "True"
        od["nreplicates"] = ",NReplicates->%i" % nreplicates
    else:
        od["bootstrap"] = "False"
        od["nreplicates"] = ""
    od["outfile"] = "tf_reconstruction.output"
    od["map"] = ",".join('"%s"' % group.upper() for group in groups)

    if not run_analysis:
        print(tls % od)
        return (None, None)

    # Write data file
    alignment.writePhylip(datafile_name)

    # Write control file
    tl_file = "tf_control.tl"
    with open(tl_file, "w") as fh:
        fh.write(tls % od)

    # DEVNULL replaces a hand-opened /dev/null handle that was never
    # closed and does not exist on every platform.
    direct = subprocess.STDOUT if verbose else subprocess.DEVNULL
    child = subprocess.Popen(["tf", tl_file], stderr=direct)
    if verbose:
        print("Running TreeFinder, this could take some time...", end=' ')
        sys.stdout.flush()
    child.communicate()
    if verbose:
        print("done.")
        sys.stdout.flush()

    if child.returncode != 0:
        gm.append("TreeFinder returned error code %s" % (child.returncode))
        raise P4Error(gm)

    # NOTE(review): the report record is taken from the second line of the
    # output file, as before -- confirm against TreeFinder's report format.
    with open(od["outfile"], "r") as fh:
        line = fh.readlines()[1]

    def field(key, next_key):
        """Return the text after *key*, up to one character before *next_key*."""
        start = line.index(key) + len(key)
        return line[start:line.index(next_key) - 1]

    rd = {}
    # The likelihood runs up to the first comma of the record.
    likelihood_start = line.index("Likelihood->") + len("Likelihood->")
    rd["Likelihood"] = float(line[likelihood_start:line.index(",")])
    rd["Phylogeny"] = field("Phylogeny->", "SubstitutionModel->")
    rd["SubstitutionModel"] = field("SubstitutionModel->",
                                    "OSubstitutionModel->")
    rd["OSubstitutionModel"] = field("OSubstitutionModel->",
                                     "OEdgeOptimizationOff->")
    rd["Nsites"] = int(field("NSites->", "NParameters->"))
    rd["NParameters"] = int(field("NParameters->", "AIC->"))
    rd["AIC"] = float(field("AIC->", "AICc->"))
    rd["AICc"] = float(field("AICc->", "HQ->"))
    rd["HQ"] = float(field("HQ->", "BIC->"))
    rd["BIC"] = float(field("BIC->", "Checksum->"))
    rd["LikelihoodTime"] = float(field("LikelihoodTime->",
                                       "LikelihoodMemory->"))
    # Last field of the record: the final three characters are dropped.
    memory_start = (line.index("LikelihoodMemory->")
                    + len("LikelihoodMemory->"))
    rd["LikelihoodMemory"] = int(line[memory_start:-3])

    # Make a tree object: braces become parentheses, quotes are removed.
    tree = rd["Phylogeny"].replace("{", "(")
    tree = tree.replace("}", ")")
    tree = tree.replace("\"", "")
    tree = tree + ";"
    if bootstrap:
        # Tree viewer has the brlen before bootstrap value plus an extra
        # colon: turn "xxx):0.00001:87.999,yyy" into
        # "xxx)87.999:0.00001,yyy"
        patt = re.compile(r"\):([0-9]+\.[0-9e-]+):([0-9]+\.[0-9e-]*)")
        repl = r")\2:\1"
        tree = re.sub(patt, repl, tree)

    # The tree is passed to read() as a string, so silence the "not a
    # file" warning and restore the flag even if read() raises.
    origw = var.warnReadNoFile
    var.warnReadNoFile = False
    try:
        read(tree)
    finally:
        var.warnReadNoFile = origw
    result_tree = var.trees.pop()

    if bootstrap:
        # Round floats to whole-number percentages
        for node in result_tree.iterInternalsNoRoot():
            node.name = "%2.f" % float(node.name)

    if remove_files:
        os.remove("tf_control.tl")
        os.remove("tf_data.phy")
        os.remove("tf_reconstruction.output")

    if verbose:
        print("\n")
        result_tree.draw()
        print("\nLikelihood: %.4f\n" % rd["Likelihood"])

    return result_tree, rd