def makeRandomTree(names=list(string.lowercase), contract_seuil=0,
                   feature_to_contract='support', random_branches=False):
    """Build a random gene tree with one leaf per entry of ``names``.

    :argument names: names handed to ``populate`` as ``names_library``;
        the tree gets ``len(names)`` leaves
    :argument contract_seuil: threshold passed to ``contract_tree`` as
        ``seuil``
    :argument feature_to_contract: feature name passed to ``contract_tree``
    :argument random_branches: forwarded unchanged to ``populate``
    :returns: the populated and contracted ``TreeClass`` instance
    """
    leaf_count = len(names)
    random_tree = TreeClass()
    random_tree.populate(leaf_count,
                         names_library=names,
                         random_branches=random_branches)
    random_tree.contract_tree(seuil=contract_seuil,
                              feature=feature_to_contract)
    return random_tree
def executePipe(tree, nxsfile=None, fasta=None, al=0, type=None, treefile=None):
    """Prepare the tree and alignment files, then call ``executePhyML``.

    The tree is first pruned (in place) to the first seven leaves met
    while iterating over it. If ``treefile`` is given, the tree is then
    reloaded from that file; otherwise the pruned tree is written to
    ``tree.nw``. When no nexus file is given, one is produced from
    ``fasta`` (itself written from the tree when missing).

    :argument tree: tree to process; must end up being a TreeClass
    :argument nxsfile: existing nexus alignment file, if any
    :argument fasta: existing fasta file, if any
    :argument al: forwarded to ``write_al_in_nxs_file``
    :argument type: not used by this function
    :argument treefile: existing newick file, if any
    """
    # Only the first seven leaves are kept.
    kept_names = [leaf.name for rank, leaf in enumerate(tree) if rank < 7]
    tree.prune(kept_names)

    if treefile is None:
        try:
            tree.write(format=0, outfile="tree.nw")
            treefile = "tree.nw"
        except Exception as e:
            # Best effort: report the failure and carry on.
            print(e)
            print("Can't write tree to 'tree.nw'")
    else:
        tree = TreeClass(treefile)

    # NOTE(review): this check only runs after the tree was already used above.
    if not isinstance(tree, TreeClass):
        raise ValueError("You sould use a TreeNode instance")

    if nxsfile is None:
        if fasta is None:
            print("")
            print("WRITING your sequence into a fasta file")
            tree.writeSeqToFasta(comment=0)
            fasta = "seq.fasta"
        nxsfile = write_al_in_nxs_file(fasta, al=al)

    executePhyML(nxsfile, treefile)
def getRFval(refTree_path, tree_path, unroot=False):
    """Compute the Robinson-Foulds value between two trees.

    :argument refTree_path: input used to build the reference TreeClass
    :argument tree_path: input used to build the compared TreeClass
    :argument unroot: when true, the reference tree is unrooted first and
        ``unrooted_trees=True`` is passed to ``robinson_foulds``
    :returns: the pair ``(rf, max_rf)``
    """
    reference = TreeClass(refTree_path)
    compared = TreeClass(tree_path)
    if unroot:
        reference.unroot()
    outcome = reference.robinson_foulds(compared, unrooted_trees=unroot)
    # Only the first two of the five returned values are of interest.
    rf, max_rf, _, _, _ = outcome
    return rf, max_rf
def fetch_ensembl_genetree_by_member(memberID=None, species=None, id_type=None,
                                     output="nh", nh_format="full"):
    """Fetch genetree from a member ID

    :argument memberID: the ensembl gene ID member of the tree to fetch,
        this is mandatory! EX: ENSG00000157764
    :argument species: Registry name/aliases used to restrict searches by.
        Only required if a stable ID is not unique to a species (not the
        case with Ensembl databases) EX: human, homo_sapiens
    :argument id_type: Object type to restrict searches to. Used when a
        stable ID is not unique to a single class. EX: gene, transcript
    :argument output: nh / phyloxml, type of output we are looking for!
    :argument nh_format: full / display_label_composite / simple / species /
        species_short_name / ncbi_taxon / ncbi_name / njtree / phylip,
        The format of the nh output, only useful when the output is set
        to nh
    :returns: a TreeClass built from the response, or the result of
        ``getTreeFromPhyloxml`` when phyloxml output was requested
    :raises ValueError: if ``memberID`` is empty or the server does not
        answer with status 200
    """
    if not memberID:
        # Was ``valueError`` (undefined name), which raised NameError instead.
        raise ValueError('Please provide a member id')

    http = httplib2.Http(".cache")
    server = "http://rest.ensembl.org"
    ext = "/genetree/member/id/%s?" % (memberID)
    if species:
        ext = ext + "species=" + species + ";"
    if id_type:
        ext = ext + "object_type=" + id_type + ";"
    if output == "nh":
        ext = ext + "nh_format=%s;" % nh_format

    content_type = "text/x-" + output
    resp, content = http.request(server + ext, method="GET",
                                 headers={"Content-Type": content_type})
    if resp.status != 200:
        # Single formatted argument: prints the same text on Python 2 and 3.
        print("Invalid response: %s" % resp.status)
        raise ValueError('Failled to process request!')

    if content_type.lower() != "text/x-phyloxml":
        return TreeClass(content)
    return getTreeFromPhyloxml(content)
def fetch_ensembl_genetree_by_id(treeID=None, aligned=0, sequence="none",
                                 output="nh", nh_format="full"):
    """Fetch genetree from ensembl tree ID

    :argument treeID: the ensembl tree ID, this is mandatory
    :argument aligned: boolean (0/1), used with sequence to retrieve
        aligned sequence
    :argument sequence: none / protein / cdna / gene, should we retrieve
        sequence also?, work only with phyloxml nh_format
    :argument output: nh / phyloxml, type of output we are looking for!
    :argument nh_format: full / display_label_composite / simple / species /
        species_short_name / ncbi_taxon / ncbi_name / njtree / phylip,
        The format of the nh output, only useful when the output is set
        to nh
    :returns: a TreeClass built from the response, or the result of
        ``getTreeFromPhyloxml`` when phyloxml output was requested
    :raises ValueError: if ``treeID`` is empty or the response code is
        not 200
    """
    if not treeID:
        # Was ``valueError`` (undefined name), which raised NameError instead.
        raise ValueError('Please provide a genetree id')

    server = "http://rest.ensembl.org"
    ext = "/genetree/id/%s?sequence=%s;aligned=%i" % (treeID, sequence,
                                                      aligned)
    if output == "nh":
        ext = ext + ";nh_format=%s" % nh_format

    content_type = "text/x-" + output
    request = urllib2.Request(server + ext,
                              headers={"Content-Type": content_type})
    resp = urllib2.urlopen(request)
    content = resp.read()
    if resp.getcode() != 200:
        # Single formatted argument: prints the same text on Python 2 and 3.
        print("Invalid response: %s" % resp.getcode())
        raise ValueError('Failled to process request!')

    if content_type.lower() != "text/x-phyloxml":
        return TreeClass(content)
    return getTreeFromPhyloxml(content)
def condense_node_order(matrice, smallest_index, node_order, method='upgma'):
    """Merge the two nodes designated by ``smallest_index`` in ``node_order``.

    Used while building a tree alongside the condensation of a distance
    matrix. The two nodes become the children of a new TreeClass node.
    That node is stored at the second index, and the entry at the first
    index is then deleted from the list.

    Branch lengths are stored in the ``length`` feature:
    - 'nj': the pair returned by ``paired_node_distance``
    - 'upgma': half of ``matrice[index1, index2]`` for each node
    - anything else: 0 for both nodes
    The new node receives the sum of both lengths.

    :returns: ``node_order``, modified in place (one element shorter)
    """
    first, second = smallest_index
    left = node_order[first]
    right = node_order[second]

    algo = method.lower()
    if algo == 'nj':
        branch_lengths = paired_node_distance(matrice, smallest_index)
    elif algo == 'upgma':
        half = matrice[first, second] / 2.0
        branch_lengths = (half, half)
    else:
        branch_lengths = (0, 0)

    left.add_features(length=branch_lengths[0])
    right.add_features(length=branch_lengths[1])

    # Group both nodes under a fresh parent.
    merged = TreeClass()
    merged.add_child(left)
    merged.add_child(right)
    merged.add_features(length=sum(branch_lengths))

    # The parent takes the second slot; the first slot is dropped.
    node_order[second] = merged
    del node_order[first]
    return node_order
def makeRandomTree(names=list(string.lowercase), contract_seuil=0,
                   feature_to_contract='support', random_branches=False):
    """Return a randomly populated gene tree, contracted afterwards.

    :argument names: library of leaf names; its length sets the number
        of leaves requested from ``populate``
    :argument contract_seuil: ``seuil`` argument of ``contract_tree``
    :argument feature_to_contract: ``feature`` argument of ``contract_tree``
    :argument random_branches: passed through to ``populate``
    """
    populate_options = {
        'names_library': names,
        'random_branches': random_branches,
    }
    gene_tree = TreeClass()
    gene_tree.populate(len(names), **populate_options)
    gene_tree.contract_tree(seuil=contract_seuil, feature=feature_to_contract)
    return gene_tree
def retrieveDupAndLostCost(treefile, streefile, smap, sep=None, pos='prefix'):
    """Reconcile a gene tree with a species tree and return its DL score.

    :argument treefile: input used to build the gene TreeClass
    :argument streefile: input used to build the species TreeClass
    :argument smap: path of a gene-to-species map, or an object usable in
        a ``with`` statement and iterable line by line. Each non-empty
        line holds ``<gene pattern> <species>``; a bare ``*`` in the
        pattern is turned into the regex ``.*``
    :argument sep: forwarded to ``set_species``
    :argument pos: forwarded to ``set_species``
    :returns: the value of ``TreeUtils.computeDLScore`` on the reconciled
        gene tree
    """
    genetree = TreeClass(treefile)
    specietree = TreeClass(streefile)
    regexmap = {}
    speciemap = {}
    with open(smap, 'rU') if isinstance(smap, basestring) else smap as INPUT:
        for line in INPUT:
            fields = line.strip().split()
            if not fields:
                # Blank lines used to crash the tuple unpacking below.
                continue
            g, s = fields
            # Only expand shell-like wildcards; a pattern that is already
            # a regex ('.*') would otherwise become '..*'.
            if '*' in g and '.*' not in g:
                g = g.replace('*', '.*')
            regexmap[re.compile(g)] = s

    for leaf in genetree:
        for key, value in regexmap.items():
            if key.match(leaf.name):
                speciemap[leaf.name] = value

    genetree.set_species(speciesMap=speciemap, sep=sep, pos=pos)
    lcamap = TreeUtils.lcaMapping(genetree, specietree)
    TreeUtils.reconcile(genetree, lcaMap=lcamap, lost="yes")
    return TreeUtils.computeDLScore(genetree)
def retrieveDupAndLostCost(treefile, streefile, smap, sep=None, pos='prefix'):
    """Return the duplication/loss score of a gene tree against a species tree.

    The gene-to-species map is read from ``smap`` (a path, or an object
    usable in a ``with`` statement and iterable line by line). Each
    non-empty line is ``<gene pattern> <species>``, where a bare ``*`` in
    the pattern stands for the regex ``.*``. Every leaf whose name matches
    a pattern is assigned the corresponding species, then the tree is
    reconciled with losses enabled.

    :argument treefile: input used to build the gene TreeClass
    :argument streefile: input used to build the species TreeClass
    :argument sep: forwarded to ``set_species``
    :argument pos: forwarded to ``set_species``
    :returns: the value of ``TreeUtils.computeDLScore``
    """
    genetree = TreeClass(treefile)
    specietree = TreeClass(streefile)

    regexmap = {}
    with open(smap, 'rU') if isinstance(smap, basestring) else smap as INPUT:
        for line in INPUT:
            columns = line.strip().split()
            if not columns:
                # Tolerate empty lines instead of failing on unpacking.
                continue
            pattern, specie = columns
            if '*' in pattern and '.*' not in pattern:
                # Leave patterns that are already regexes untouched.
                pattern = pattern.replace('*', '.*')
            regexmap[re.compile(pattern)] = specie

    speciemap = {}
    for leaf in genetree:
        for regex, specie in regexmap.items():
            if regex.match(leaf.name):
                speciemap[leaf.name] = specie

    genetree.set_species(speciesMap=speciemap, sep=sep, pos=pos)
    lcamap = TreeUtils.lcaMapping(genetree, specietree)
    TreeUtils.reconcile(genetree, lcaMap=lcamap, lost="yes")
    return TreeUtils.computeDLScore(genetree)
def getRFval(refTree_path, tree_path, unroot=False):
    """Return ``(rf, max_rf)`` for a tree compared with a reference tree.

    Both trees are built with ``TreeClass``. When ``unroot`` is true the
    reference tree is unrooted, and the comparison is asked to treat the
    trees as unrooted.
    """
    ref_tree = TreeClass(refTree_path)
    other_tree = TreeClass(tree_path)
    if unroot:
        ref_tree.unroot()
    # robinson_foulds is expected to return five values here.
    rf, max_rf, _, _, _ = ref_tree.robinson_foulds(
        other_tree, unrooted_trees=unroot)
    return rf, max_rf
def polySolverPreprocessing(genetree, specietree, distance_file, capitalize=False, gene_sep = None, specie_pos="postfix", dist_diagonal=1e305, nFlag=False):
    """Prepare the gene tree, species tree and distance matrix.

    :argument genetree: TreeClass, or a string passed through
        ``newick_preprocessing`` before building the tree
    :argument specietree: TreeClass, or a string handled the same way
    :argument distance_file: distance matrix input for
        ``clu.distMatProcessor``; when falsy a fake matrix is generated
    :argument capitalize: forwarded to ``set_species``
    :argument gene_sep: gene/species separator, forwarded to ``set_species``
    :argument specie_pos: forwarded to ``set_species`` as ``pos``
    :argument dist_diagonal: forwarded to the matrix builders
    :argument nFlag: forwarded to ``clu.distMatProcessor``
    :returns: ``(genetree, specietree, gene_matrix, node_order)``
    :raises Exception: if a species of the gene tree is missing from the
        species tree
    """
    # TODO :
    # 1) Correct newick
    # 2) Sequence retrieve
    # 3) PhyML to align sequence and make a distance matrice

    # Gene tree
    if isinstance(genetree, basestring):
        genetree, gene_sep = newick_preprocessing(genetree, gene_sep)
        genetree = TreeClass(genetree)
    genetree.set_species(sep=gene_sep, capitalize=capitalize, pos=specie_pos)

    # Species tree
    if isinstance(specietree, basestring):
        specietree, sep = newick_preprocessing(specietree, '')
        specietree = TreeClass(specietree)
    specietree.label_internal_node()

    # Distance matrix
    if not distance_file:
        # Alternative: retrieve aligned sequences and run phyML.
        node_order = genetree.get_leaf_names()
        gene_matrix = clu.makeFakeDstMatrice(
            len(node_order), 0, 1, dist_diagonal)
    else:
        gene_matrix, node_order = clu.distMatProcessor(
            distance_file, dist_diagonal, nFlag)
        matrix_only = set(node_order).difference(
            set(genetree.get_leaf_names()))
        if matrix_only:
            reset_node_name(genetree, gene_sep)

    # Every species of the gene tree must be a leaf of the species tree.
    gene_species = set(genetree.get_leaf_species())
    tree_species = set([leaf.name for leaf in specietree.get_leaves()])
    if gene_species - tree_species:
        raise Exception("Species in genetree but not in specietree : %s" %
                        (", ".join(gene_species - tree_species)))

    return genetree, specietree, gene_matrix, node_order
def getTreeFromPhyloxml(xml, saveToFile="default.xml", delFile=True):
    """Read phylogeny trees from a phyloxml string.

    The string is written to ``saveToFile`` so that ``Phyloxml`` can
    parse it from disk.

    :argument xml: phyloxml content
    :argument saveToFile: path of the intermediate file
    :argument delFile: when true, the intermediate file is removed once
        parsing is over, whether it succeeded or not
    :returns: a single TreeClass when the document holds exactly one
        phylogeny, otherwise the list of TreeClass objects
    """
    project = Phyloxml()
    # ``with`` closes the handle even if the write fails.
    with open(saveToFile, "w+") as handle:
        handle.write(xml)

    try:
        project.build_from_file(saveToFile)
        treeList = [TreeClass.import_from_PhyloxmlTree(tree)
                    for tree in project.get_phylogeny()]
    finally:
        # Do not leave the intermediate file behind on a parse error.
        if delFile:
            os.remove(saveToFile)

    if len(treeList) == 1:
        return treeList[0]
    return treeList
def getTreeFromPhyloxml(xml, saveToFile="default.xml", delFile=True):
    """Build TreeClass object(s) from a phyloxml string.

    ``xml`` is dumped to ``saveToFile``, parsed with ``Phyloxml``, and
    each phylogeny found is converted with
    ``TreeClass.import_from_PhyloxmlTree``.

    :argument xml: phyloxml content
    :argument saveToFile: path of the intermediate file
    :argument delFile: remove the intermediate file after parsing (also
        when parsing raises)
    :returns: the tree itself when exactly one phylogeny is found,
        otherwise a list of trees
    """
    project = Phyloxml()
    # The context manager guarantees the handle is closed on error.
    with open(saveToFile, "w+") as fo:
        fo.write(xml)

    treeList = []
    try:
        project.build_from_file(saveToFile)
        for tree in project.get_phylogeny():
            treeList.append(TreeClass.import_from_PhyloxmlTree(tree))
    finally:
        # Clean up even when the document could not be parsed.
        if delFile:
            os.remove(saveToFile)

    return treeList[0] if len(treeList) == 1 else treeList
def condense_node_order(matrice, smallest_index, node_order, method='upgma'):
    """Combine two entries of ``node_order`` into a single parent node.

    Companion of the matrix condensation step: the nodes found at the two
    positions of ``smallest_index`` are attached to a new TreeClass node.
    The new node replaces the entry at the second position, and the entry
    at the first position is removed from the list.

    The ``length`` feature of each child is set according to ``method``
    ('nj': values from ``paired_node_distance``; 'upgma': half of the
    matrix distance each; otherwise 0), and the parent's ``length`` is
    their sum.

    :returns: the same ``node_order`` list, shortened by one element
    """
    index1, index2 = smallest_index
    children = [node_order[index1], node_order[index2]]

    chosen = method.lower()
    if chosen == 'nj':
        lengths = paired_node_distance(matrice, smallest_index)
    elif chosen == 'upgma':
        separation = matrice[index1, index2]
        lengths = (separation / 2.0, separation / 2.0)
    else:
        lengths = (0, 0)

    for rank, child in enumerate(children):
        child.add_features(length=lengths[rank])

    parent = TreeClass()
    for child in children:
        parent.add_child(child)
    parent.add_features(length=sum(lengths))

    # Parent goes into slot index2, then slot index1 is deleted.
    node_order[index2] = parent
    del node_order[index1]
    return node_order
def sample_event_on_branches(self, time, spnode, birth, death, transfer,
                             gnode=None, keeplosses=False, ign_suc_trn=False,
                             ecounter=None):
    """Simulate a reconstructed birth death tree

    Events are drawn along a branch of duration ``time`` located in the
    species node ``spnode``. The waiting time to the next event follows
    an exponential law whose rate is ``birth + death + transfer``.

    :argument time: duration of the branch
    :argument spnode: species node in which the simulation starts
    :argument birth: duplication rate
    :argument death: loss rate
    :argument transfer: transfer rate
    :argument gnode: starting gene node (a new TreeClass when None)
    :argument keeplosses: when false, lost lineages are pruned
    :argument ign_suc_trn: when true, no transfer is performed on a node
        whose parent is itself a transfer
    :argument ecounter: mapping updated with the number of 'dup', 'loss'
        and 'transfer' events; a plain dict or a defaultdict both work
    :returns: ``(gnode, died, transfered, map_to_spec)``
    """
    # A shared ``{}`` default would leak counts between calls, and the
    # former ``ecounter[key] += 1`` raised KeyError on a plain dict.
    if ecounter is None:
        ecounter = {}

    def count_event(kind):
        ecounter[kind] = ecounter.get(kind, 0) + 1

    # we are going with a poisson process, so the rate of having an event
    # is just the sum of rates
    event_rate = float(birth + death + transfer)
    died = set()
    transfered = {}
    map_to_spec = {}

    # create starting node if one is not given
    if gnode is None:
        gnode = TreeClass()
    map_to_spec[gnode] = spnode

    def event_in_time(time, node, spnode):
        # time for an event
        if event_rate == 0.0:
            next_t = INF
        else:
            next_t = random.expovariate(event_rate)

        if next_t > time:
            # no event on branch, we can stop
            node.dist = time
            node.add_features(type=INF)
            return

        eprob = random.random()
        node.dist = next_t
        if eprob < birth * 1.0 / event_rate:
            # birth ==> duplication event
            cnode = TreeClass()
            node.add_child(cnode)
            map_to_spec[cnode] = spnode
            # compute event on the remaining time
            event_in_time(time - next_t, cnode, spnode)

            cnode = TreeClass()
            node.add_child(cnode)
            map_to_spec[cnode] = spnode
            event_in_time(time - next_t, cnode, spnode)

            node.add_features(type=TreeClass.AD)
            count_event('dup')

        elif eprob < (birth + death) * 1.0 / event_rate:
            # death happen ==> loss
            node.add_features(type=TreeClass.LOST)
            map_to_spec[node] = spnode
            count_event('loss')
            died.add(node)

        else:
            # give gene to another species ==> transfer
            contemp_transfer_nodes = list(
                spnode.get_incomparable_list(timeconsistent=True,
                                             wtime=next_t))
            if contemp_transfer_nodes and not (
                    ign_suc_trn and node.up and node.up.has_feature(
                        'type', name=TreeClass.TRANSFER)):
                cand_receiver = random.choice(contemp_transfer_nodes)
                # The type is set before recursing: children look at it.
                node.add_features(type=TreeClass.TRANSFER)
                count_event('transfer')

                # copy staying in the donor species
                cnode = TreeClass()
                node.add_child(cnode)
                map_to_spec[cnode] = spnode
                event_in_time(time - next_t, cnode, spnode)

                # copy sent to the receiver species
                cnode = TreeClass()
                node.add_child(cnode)
                cnode.add_features(transfered=True)
                transfered[cnode] = cand_receiver
                map_to_spec[cnode] = cand_receiver
                t = cand_receiver.brlen - spnode.brlen + time - next_t
                event_in_time(t, cnode, cand_receiver)
            else:
                # keep node as it is, so speciation
                node.add_features(type=INF)
                self.debug_msg("Could not perform transfer at node")
                self.debug_msg(node)

    event_in_time(time, gnode, spnode)

    if not keeplosses:
        leaves = set(gnode.get_leaves()) - died
        if len(leaves) == 0:
            # every lineage died: the whole subtree is a loss
            gnode.add_features(type=TreeClass.LOST)
            died.add(gnode)
        else:
            gnode.prune(leaves)
            gnode.delete_single_child_internal()

    return gnode, died, transfered, map_to_spec
def polySolverPreprocessing(genetree, specietree, distance_mat, capitalize=False, gene_sep=None, specie_pos="postfix", nFlagVal=1e305, nFlag=False, smap=None, errorproof=False):
    """Preprocess genetree for polytomysolver

    :argument genetree: TreeClass or newick input for the gene tree
    :argument specietree: TreeClass or newick input for the species tree
    :argument distance_mat: distance matrix input (string) handed to
        ``clu.distMatProcessor``; any other truthy value means the matrix
        is obtained with ``get_distance_from_tree``
    :argument capitalize: forwarded to ``set_species``
    :argument gene_sep: gene/species separator in leaf names
    :argument specie_pos: position of the species in leaf names
    :argument nFlagVal: forwarded to ``clu.distMatProcessor``
    :argument nFlag: forwarded to ``clu.distMatProcessor``
    :argument smap: gene-to-species map: a dict, a path, or an object
        usable in a ``with`` statement and iterable line by line (lines of
        ``<gene pattern> <species>``, patterns are case-insensitive)
    :argument errorproof: when true, try to repair mismatches between the
        matrix and the gene tree instead of raising
    :returns: ``(genetree, specietree, gene_matrix, node_order)``
    :raises ValueError: on duplicated genes, on a matrix/tree mismatch
        (unless ``errorproof``), or when no distance matrix is available
    :raises IndexError: if a gene cannot be removed from the matrix
    :raises Exception: on genes or species that cannot be matched
    """
    # genetree input
    speciemap = None
    if isinstance(genetree, basestring) and not smap:
        genetree, gene_sep = newickPreprocessing(genetree, gene_sep)
        genetree = TreeClass(genetree)

    elif smap:
        # Build the tree whatever the kind of map: a dict map combined
        # with a newick string used to leave ``genetree`` as a string.
        if isinstance(genetree, basestring):
            genetree = TreeClass(genetree)

        if isinstance(smap, dict):
            speciemap = smap
        else:
            regexmap = {}
            speciemap = {}
            with open(smap, 'rU') if isinstance(smap, basestring) else smap as INPUT:
                for line in INPUT:
                    fields = line.strip().split()
                    if not fields:
                        # skip blank lines instead of failing on unpacking
                        continue
                    g, s = fields
                    if ('*') in g and '.*' not in g:
                        g = g.replace('*', '.*')
                    g_regex = re.compile(g, re.IGNORECASE)
                    regexmap[g_regex] = s

            for leaf in genetree:
                for key, value in regexmap.items():
                    if key.match(leaf.name):
                        speciemap[leaf.name] = value

    genetree.set_species(speciesMap=speciemap, sep=gene_sep,
                         capitalize=capitalize, pos=specie_pos)

    # genetree check
    if len(genetree) != len(set(genetree.get_leaf_names())):
        tmp_leaf_name = genetree.get_leaf_names()
        duplicates = set(
            [x for x in tmp_leaf_name if tmp_leaf_name.count(x) > 1])
        raise ValueError(
            "Your polytomy contains the following gene multiple times : %s" %
            ", ".join(duplicates))

    # specietree input
    if isinstance(specietree, basestring):
        specietree, sep = newickPreprocessing(specietree, '')
        specietree = TreeClass(specietree)
    specietree.label_internal_node()

    # distance matrice input
    if (distance_mat):
        if isinstance(distance_mat, basestring):
            gene_matrix, node_order = clu.distMatProcessor(
                distance_mat, nFlagVal, nFlag)
        else:
            # distance mat is provided as a boolean
            # in that case, just try to get it from the genetree
            gene_matrix, node_order = get_distance_from_tree(genetree)

        # Difference check 1
        listerr = set(node_order).symmetric_difference(
            set(genetree.get_leaf_names()))
        if listerr:
            if not errorproof:
                raise ValueError(
                    "Different genes in distance matrix and genetree\n : See symmetric difference : %s\n"
                    % ", ".join(listerr))
            else:
                if gene_sep:
                    resetNodeName(genetree, gene_sep, specie_pos == 'postfix')
                else:
                    exib1 = set(node_order).difference(
                        set(genetree.get_leaf_names()))
                    exib2 = set(genetree.get_leaf_names()).difference(
                        set(node_order))
                    if exib2:
                        raise Exception(
                            'Genes in trees and not in matrix : %s' % (exib2))
                    elif exib1:
                        print(
                            "Genes in matrix and not in tree : %s \nAttempt to correct distance matrix"
                            % (", ".join(exib1)))
                        for l in exib1:
                            try:
                                lpos = node_order.index(l)
                                gene_matrix = clu.remove_ij(
                                    gene_matrix, lpos, lpos)
                                del node_order[lpos]
                            except Exception:
                                # was a bare ``except:`` which also trapped
                                # KeyboardInterrupt / SystemExit
                                raise IndexError(
                                    "Could not remove gene %s from distance matrix"
                                    % l)
    else:
        # This is for debug, will never happen
        raise ValueError(
            "distance matrix not provided and could not be infered from tree")

    # Find list of species in genetree but not in specietree
    specieGeneList = set(genetree.get_leaf_species())
    specieList = set([x.name for x in specietree.get_leaves()])
    if (specieGeneList - specieList):
        if len(specieGeneList.intersection(specieList)) == 0 and gene_sep:
            raise Exception(
                "*** You probably didn't set the correct species position for you input tree !!"
            )
        raise Exception("Species in genetree but not in specietree : %s" %
                        (", ".join(specieGeneList - specieList)))

    return genetree, specietree, gene_matrix, node_order
def reconcile(geneTree=None, lcaMap=None, lost="no"):
    """Reconcile genetree topology to a specieTree, using an adequate
    mapping obtained with lcaMapping.

    'reconcile' will infer evolutionary events like gene lost, gene
    speciation and gene duplication with distinction between AD and NAD.
    The gene tree is annotated in place (``type`` feature), and when
    ``lost`` is "yes" (case-insensitive) lost nodes are inserted in the
    tree and ``lcaMap`` is extended with the nodes created.

    :argument geneTree: the gene tree to annotate
    :argument lcaMap: mapping from gene tree nodes to species tree nodes
    :argument lost: "yes" to also infer losses
    :raises Exception: if ``geneTree`` or ``lcaMap`` is missing
    """
    # The guard used to test the builtin ``map`` (never None), so a
    # missing mapping went undetected.
    if lcaMap is None or geneTree is None:
        raise Exception("lcaMapping or geneTree not found")

    for node in geneTree.traverse("levelorder"):
        node.add_features(type=TreeClass.SPEC)
        if (not node.is_leaf() and
                (lcaMap[node] == lcaMap[node.get_child_at(0)] or
                 lcaMap[node] == lcaMap[node.get_child_at(1)])):
            # node maps to the same species node as one of its children
            node.type = TreeClass.AD
            if not (set(node.get_child_at(0).get_species()).intersection(
                    set(node.get_child_at(1).get_species()))):
                # children share no species
                node.type = TreeClass.NAD

    if lost.upper() == "YES":
        for node in geneTree.traverse("postorder"):
            children_list = node.get_children()
            # children_list grows while being iterated: re-attached
            # subtrees are appended and visited as well.
            for child_c in children_list:
                if ((lcaMap[child_c].up != lcaMap[node] and
                     lcaMap[child_c] != lcaMap[node]) or
                        (node.type == TreeClass.AD and
                         lcaMap[node] != lcaMap[child_c])):

                    while ((lcaMap[child_c].up != lcaMap[node] and
                            node.type == TreeClass.SPEC) or
                           (lcaMap[child_c] != lcaMap[node] and
                            node.type != TreeClass.SPEC)):
                        lostnode = TreeClass()
                        intern_lost = TreeClass()
                        intern_lost.type = TreeClass.SPEC
                        if lcaMap[child_c].is_root():
                            intern_lost.species = ",".join(
                                lcaMap[child_c].get_leaf_names())
                            lcaMap.update({intern_lost: lcaMap[child_c]})
                        else:
                            intern_lost.species = ",".join(
                                lcaMap[child_c].up.get_leaf_names())
                            lcaMap.update({intern_lost: lcaMap[child_c].up})

                        # change here to display a subtree and not a leaf
                        # with a lot of specie
                        lostnode.species = ",".join(
                            set(lcaMap[intern_lost].get_leaf_names()) -
                            set(child_c.species.split(",")))
                        lostnode.type = TreeClass.LOST
                        child_c.detach()
                        intern_lost.add_child(child=lostnode)
                        intern_lost.add_child(child=child_c)
                        child_c = intern_lost

                    node.add_child(child_c)
                    children_list.append(child_c)

            # Case of polytomie in species tree....
            if not node.is_leaf():
                specie_list = ",".join(
                    [",".join(lcaMap[child_c].get_leaf_names())
                     for child_c in node.get_children()])
                child_specie_set = set(specie_list.split(","))
                real_specie_list = set(lcaMap[node].get_leaf_names())
                unadded_specie = real_specie_list - child_specie_set
                if unadded_specie:
                    lostnode = TreeClass()
                    lostnode.type = TreeClass.LOST
                    lostnode.species = ",".join(unadded_specie)
                    node.add_child(lostnode)
def event_in_time(time, node, spnode):
    """Draw the events happening on ``node`` during ``time``.

    Relies on names provided by the enclosing scope (``event_rate``,
    ``birth``, ``death``, ``ecounter``, ``died``, ``transfered``,
    ``map_to_spec``, ``ign_suc_trn`` and ``self``).

    :argument time: time left on the branch
    :argument node: gene node being extended
    :argument spnode: species node currently hosting ``node``
    """
    # Waiting time before the next event.
    next_t = INF if event_rate == 0.0 else random.expovariate(event_rate)

    if next_t > time:
        # Nothing happens on this branch.
        node.dist = time
        node.add_features(type=INF)
        return

    draw = random.random()
    node.dist = next_t

    if draw < birth * 1.0 / event_rate:
        # Duplication: both copies continue in the same species.
        for _ in (0, 1):
            copy = TreeClass()
            node.add_child(copy)
            map_to_spec[copy] = spnode
            event_in_time(time - next_t, copy, spnode)
        node.add_features(type=TreeClass.AD)
        ecounter['dup'] += 1
        return

    if draw < (birth + death) * 1.0 / event_rate:
        # Loss of the lineage.
        node.add_features(type=TreeClass.LOST)
        map_to_spec[node] = spnode
        ecounter['loss'] += 1
        died.add(node)
        return

    # Transfer towards another species.
    receivers = list(
        spnode.get_incomparable_list(timeconsistent=True, wtime=next_t))
    if not receivers or (ign_suc_trn and node.up and node.up.has_feature(
            'type', name=TreeClass.TRANSFER)):
        # No transfer can be performed: the node is kept as it is.
        node.add_features(type=INF)
        self.debug_msg("Could not perform transfer at node")
        self.debug_msg(node)
        return

    cand_receiver = random.choice(receivers)
    # Typed before recursing: the children inspect their parent's type.
    node.add_features(type=TreeClass.TRANSFER)
    ecounter['transfer'] += 1

    # Copy staying in the donor species.
    kept = TreeClass()
    node.add_child(kept)
    map_to_spec[kept] = spnode
    event_in_time(time - next_t, kept, spnode)

    # Copy sent to the receiver species.
    sent = TreeClass()
    node.add_child(sent)
    sent.add_features(transfered=True)
    transfered[sent] = cand_receiver
    map_to_spec[sent] = cand_receiver
    t = cand_receiver.brlen - spnode.brlen + time - next_t
    event_in_time(t, sent, cand_receiver)
def sample_event_on_branches(self, time, spnode, birth, death, transfer,
                             gnode=None, keeplosses=False, ign_suc_trn=False,
                             ecounter=None):
    """Simulate a reconstructed birth death tree

    Duplication, loss and transfer events are drawn on a branch lasting
    ``time`` inside the species node ``spnode``; the overall event rate
    is the sum of the three rates.

    :argument time: duration of the branch
    :argument spnode: species node hosting the starting gene node
    :argument birth: duplication rate
    :argument death: loss rate
    :argument transfer: transfer rate
    :argument gnode: gene node to start from (created when None)
    :argument keeplosses: keep lost lineages in the returned tree
    :argument ign_suc_trn: forbid a transfer right below another transfer
    :argument ecounter: mapping receiving the count of 'dup', 'loss' and
        'transfer' events (plain dict or defaultdict)
    :returns: ``(gnode, died, transfered, map_to_spec)`` where ``died`` is
        the set of lost nodes, ``transfered`` maps each transferred node
        to its receiver species node, and ``map_to_spec`` maps each gene
        node created to its species node
    """
    # ``ecounter={}`` was shared between calls, and ``+= 1`` on a missing
    # key of a plain dict raised KeyError.
    if ecounter is None:
        ecounter = {}

    def record(kind):
        ecounter[kind] = ecounter.get(kind, 0) + 1

    # poisson process: the rate of having an event is the sum of rates
    event_rate = float(birth + death + transfer)
    died = set()
    transfered = {}
    map_to_spec = {}

    # create starting node if one is not given
    if gnode is None:
        gnode = TreeClass()
    map_to_spec[gnode] = spnode

    def event_in_time(time, node, spnode):
        # time for an event
        if event_rate == 0.0:
            next_t = INF
        else:
            next_t = random.expovariate(event_rate)

        if next_t > time:
            # no event on branch, we can stop
            node.dist = time
            node.add_features(type=INF)
        else:
            eprob = random.random()
            node.dist = next_t
            if eprob < birth * 1.0 / event_rate:
                # birth ==> duplication event
                cnode = TreeClass()
                node.add_child(cnode)
                map_to_spec[cnode] = spnode
                # compute event on the remaining time
                event_in_time(time - next_t, cnode, spnode)
                cnode = TreeClass()
                node.add_child(cnode)
                map_to_spec[cnode] = spnode
                event_in_time(time - next_t, cnode, spnode)
                node.add_features(type=TreeClass.AD)
                record('dup')
            elif eprob < (birth + death) * 1.0 / event_rate:
                # death happen ==> loss
                node.add_features(type=TreeClass.LOST)
                map_to_spec[node] = spnode
                record('loss')
                died.add(node)
            else:
                # give gene to another species ==> transfer
                contemp_transfer_nodes = list(
                    spnode.get_incomparable_list(timeconsistent=True,
                                                 wtime=next_t))
                if contemp_transfer_nodes and not (
                        ign_suc_trn and node.up and node.up.has_feature(
                            'type', name=TreeClass.TRANSFER)):
                    cand_receiver = random.choice(contemp_transfer_nodes)
                    # set before recursing: children check the parent type
                    node.add_features(type=TreeClass.TRANSFER)
                    record('transfer')
                    cnode = TreeClass()
                    node.add_child(cnode)
                    map_to_spec[cnode] = spnode
                    event_in_time(time - next_t, cnode, spnode)
                    cnode = TreeClass()
                    node.add_child(cnode)
                    cnode.add_features(transfered=True)
                    transfered[cnode] = cand_receiver
                    map_to_spec[cnode] = cand_receiver
                    t = cand_receiver.brlen - spnode.brlen + time - next_t
                    event_in_time(t, cnode, cand_receiver)
                else:
                    # keep node as it is, so speciation
                    node.add_features(type=INF)
                    self.debug_msg("Could not perform transfer at node")
                    self.debug_msg(node)

    event_in_time(time, gnode, spnode)

    if not keeplosses:
        leaves = set(gnode.get_leaves()) - died
        if not leaves:
            # nothing survived: the starting node itself is a loss
            gnode.add_features(type=TreeClass.LOST)
            died.add(gnode)
        else:
            gnode.prune(leaves)
            gnode.delete_single_child_internal()

    return gnode, died, transfered, map_to_spec
def sample_from_tree(self, sptree, birth, death, gain, **kwargs):
    """Sample a tree within another tree using the rate specified

    Note that a tree with all leaves being extinct can be returned by
    this function. Use dlt_tree_from_sptree if you want to prevent this.

    Supplemental keyword arguments:
        - ``removeloss`` (default True): delete lost nodes from the result
        - ``disallow_suc_trn`` (default True): passed to
          ``sample_event_on_branches`` as ``ign_suc_trn``
        - ``names_library``: list of leaf names, reused cyclically; when
          absent, leaves are named ``<species>_<n>``

    :argument sptree: species tree in which the gene tree is sampled
    :argument birth: duplication rate
    :argument death: loss rate
    :argument gain: transfer rate
    :returns: ``(gtree, recon, events, true_event_counter, transfers)``
    :raises TotalExtinction: if the resulting tree has at most one leaf
    """
    # initialize gene tree
    sptree.compute_branches_length()
    sptree.label_internal_node()
    removeloss = kwargs.get("removeloss", True)
    disallow_suc_trn = kwargs.get("disallow_suc_trn", True)
    leave_names = kwargs.get("names_library", [])

    gtree = TreeClass()
    recon = {gtree: sptree}
    events = {gtree: "spec"}
    losses = set()
    transfers = {}
    true_event_counter = ddict(int)
    snode_counter = ddict(int)

    if not leave_names:
        leave_names = lambda sp, x: sp + "_" + str(x)
    # One-element list so the closure can advance it (no ``nonlocal`` in
    # Python 2). The counter was never incremented before.
    name_counter = [0]

    def create_history(snode, gnode):
        if snode.is_leaf():
            if isinstance(leave_names, list):
                # ``//`` keeps an integer count; the index used the
                # undefined name ``leaves_name`` (NameError).
                n_encounter = name_counter[0] // len(leave_names)
                gnode.name = leave_names[name_counter[0] % len(leave_names)]
                gnode.name += ("_" + gnode.name) * n_encounter
                name_counter[0] += 1
            else:
                snode_counter[snode.name] += 1
                gnode.name = leave_names(snode.name,
                                         snode_counter[snode.name])
            events[gnode] = "leaf"
            gnode.add_features(type=TreeClass.SPEC)
        else:
            for schild in snode.get_children():
                # get branches event for branch (snode, schild)
                # during time = schild.dist
                recnode, died, transfered, smap = self.sample_event_on_branches(
                    schild.dist, schild, birth, death, gain,
                    keeplosses=(not removeloss),
                    ign_suc_trn=disallow_suc_trn,
                    ecounter=true_event_counter)
                gnode.add_child(recnode)
                # update list of losses
                losses.update(died)
                transfers.update(transfered)
                recon.update(smap)
                next_cand = []
                # then record reconciliation that happened
                for node in recnode.traverse():
                    node.add_features(species=recon[node].name)
                    if node.type == TreeClass.LOST:
                        events[node] = "loss"
                        # died at the start of coalescence
                        losses.add(node)
                    elif node.is_leaf():
                        node.add_features(type=TreeClass.SPEC)
                        events[node] = "spec"
                        next_cand.append(node)
                    elif node.type == TreeClass.AD:
                        events[node] = "dup"
                    else:
                        events[node] = "transfer"
                for new_node in next_cand:
                    create_history(recon[new_node], new_node)
            # if no child for node then it is a loss
            if gnode.is_leaf():
                losses.add(gnode)

    create_history(sptree, gtree)
    gtree.delete_single_child_internal(enable_root=True)
    if removeloss:
        for node in gtree.traverse():
            if node in losses:
                node.delete()
    gtree.delete_single_child_internal(enable_root=True)

    # forget the nodes that are no longer part of the tree
    remove_from_history = set(recon.keys()) - set(gtree.traverse())
    for node in remove_from_history:
        del recon[node]
        if node in events:
            del events[node]

    if len(gtree) <= 1:
        raise TotalExtinction("All taxa are extinct.")
    return gtree, recon, events, true_event_counter, transfers
def birth_death_tree(self, birth, death, **kwargs):
    """
    Returns a birth-death tree with birth rate specified by ``birth``, and
    death rate specified by ``death``, and edge lengths in continuous
    (real) units.

    You can pass supplemental argument:
        - ``nsize`` : total number of leaves before stopping
        - ``names_library`` : list of names for the leaves
        - ``max_time`` : maximum time for simulation

    - if nsize is not provided, the length of names_library will be used
    - if nsize is larger than ``names_library``, leaves name will be
      completed with names in the following format : "T%d" (T1, T2, etc)
    - If `max_time` is given as a keyword argument, tree is grown for a
      maximum of ``max_time``.
    - if `removeloss` is given as argument (default True), extinct taxa
      are removed

    Under some conditions, it is possible for all lineages on a tree to
    go extinct. In this case, if the keyword argument
    ``repeat_until_success`` is |True| (default), then a new branching
    process is initiated. Otherwise a TotalExtinction error is raised.

    :raises MissingParameterError: if no stopping criterion is available
    :raises TotalExtinction: see above
    """
    done = False
    removeloss = kwargs.get("removeloss", True)
    repeat_until_success = kwargs.get("repeat_until_success", True)
    names_library = kwargs.get("names_library", [])
    nsize = kwargs.get("nsize", len(names_library))
    max_time = kwargs.get("max_time", None)

    pb_stop = FunctionSlot("birth death stopping")
    if nsize:
        pb_stop.add(stop_with_tree_size)
    if max_time:
        pb_stop.add(stop_with_max_time)
    if pb_stop.isEmpty() and self.stopcrit.isEmpty():
        raise MissingParameterError(
            "Either specify a names_library, nsize, max_time or a stopping criterion"
        )

    # Parameters handed to the stopping functions.
    extra_param = {}
    for k, v in kwargs.items():
        if k not in ['nsize', 'max_time', 'removeloss']:
            extra_param[k] = v
    extra_param['nsize'] = nsize
    extra_param['max_time'] = max_time

    # initialize tree
    tree = TreeClass()
    tree.dist = 0.0
    leaf_nodes = tree.get_leaves()
    curr_num_leaves = len(leaf_nodes)

    total_time = 0
    died = set([])
    event_rate = float(birth + death)

    while True:
        # waiting time based on event_rate
        wtime = random.expovariate(event_rate)
        total_time += wtime
        for leaf in leaf_nodes:
            # extinct leaves cannot update their branches length
            # (LOST is stored in the 'type' feature, not in 'name')
            if not leaf.has_feature('type', name=TreeClass.LOST):
                leaf.dist += wtime

        if not pb_stop.isEmpty():
            for val in pb_stop.applyFunctions(tree, cur_time=total_time,
                                              cur_size=curr_num_leaves,
                                              **extra_param):
                done = done or val
        if not self.stopcrit.isEmpty():
            for val in self.stopcrit.applyFunctions(
                    tree, cur_time=total_time, cur_size=curr_num_leaves,
                    **extra_param):
                done = done or val
        if done:
            break

        # if event occurs within time constraints
        if max_time is None or total_time <= max_time:
            # select node at random, then find chance it died or give
            # birth (speciation)
            node = random.choice(leaf_nodes)
            eprob = random.random()
            leaf_nodes.remove(node)
            curr_num_leaves -= 1
            if eprob < birth / event_rate:
                # Speciation
                c1 = TreeClass()
                c2 = TreeClass()
                c1.dist = 0
                c2.dist = 0
                node.add_features(type=TreeClass.SPEC)
                node.add_child(c1)
                node.add_child(c2)
                leaf_nodes.append(c1)
                leaf_nodes.append(c2)
                curr_num_leaves += 2
            else:
                # Extinction
                if curr_num_leaves > 0:
                    died.add(node)
                    node.add_features(type=TreeClass.LOST)
                else:
                    if not repeat_until_success:
                        raise TotalExtinction(
                            "All lineage went extinct, please retry")
                    # Restart the simulation because the tree has gone
                    # extinct. The root length is reset like it is for
                    # the first tree, so both start from the same state.
                    tree = TreeClass()
                    tree.dist = 0.0
                    leaf_nodes = tree.get_leaves()
                    curr_num_leaves = 1
                    died = set([])
                    total_time = 0

        # this will always hold true
        assert curr_num_leaves == len(leaf_nodes)

    if removeloss:
        leaves = set(tree.get_leaves()) - died
        tree.prune(leaves)
        tree.delete_single_child_internal(enable_root=True)

    # Name the surviving leaves: library names first, then T1, T2, ...
    leaf_nodes = tree.get_leaves()
    leaf_compteur = 1
    nlc = 0
    for node in leaf_nodes:
        if not node.has_feature('type', name=TreeClass.LOST):
            if nlc < len(names_library):
                node.name = names_library[nlc]
                nlc += 1
            else:
                node.name = "T%d" % leaf_compteur
                leaf_compteur += 1

    return tree
def pure_birth_tree(self, birth=1.0, **kwargs):
    """Generate a uniform-rate pure-birth process tree.

    Supplemental keyword arguments:

    - ``nsize`` : total number of leaves before stopping. When it is not
      provided, the length of ``names_library`` is used.
    - ``names_library`` : list of names for the leaves. Leaves beyond the
      end of the library are named "T1", "T2", etc.
    - ``max_time`` : maximum time for simulation.

    All keyword arguments except ``removeloss`` are forwarded to the
    stopping functions, together with the current time and tree size.

    :param birth: birth rate; coerced to float before use.
    :returns: the simulated ``TreeClass`` instance with named leaves.
    :raises MissingParameterError: when no stopping rule is available
        (no ``nsize``/``names_library``, no ``max_time`` and an empty
        ``self.stopcrit``).
    """
    tree = TreeClass()
    # branch length of the root: waiting time compared to its parent
    tree.dist = 0.0
    done = False
    tname = kwargs.get("names_library", [])
    nsize = kwargs.get("nsize", len(tname))
    max_time = kwargs.get("max_time", None)

    pb_stop = FunctionSlot("Pure birth stopping")
    if nsize:
        pb_stop.add(stop_with_tree_size)
    if max_time:
        pb_stop.add(stop_with_max_time)
    if pb_stop.isEmpty() and self.stopcrit.isEmpty():
        raise MissingParameterError(
            "Either specify a names_library, nsize, max_time or a stopping criterion")

    # Parameters handed to every stopping function.
    extra_param = dict(
        (k, v) for k, v in kwargs.items()
        if k not in ('nsize', 'max_time', 'removeloss'))
    extra_param['nsize'] = nsize
    extra_param['max_time'] = max_time

    # Coerce once: under Python 2 an integer ``birth`` would turn the rate
    # below into an integer division (e.g. 1 / 2 == 0), which makes
    # random.expovariate fail with ZeroDivisionError.
    birth_rate = float(birth)

    total_time = 0
    while True:
        leaf_nodes = tree.get_leaves()
        # Waiting time before the next birth.
        # NOTE(review): the rate is n_leaves / birth, as in the original
        # code; a Yule process would usually use n_leaves * birth. Confirm
        # the intent before changing it.
        wtime = random.expovariate(len(leaf_nodes) / birth_rate)
        total_time += wtime
        for leaf in leaf_nodes:
            leaf.dist += wtime

        # Every stopping function is evaluated; any truthy answer stops.
        if not pb_stop.isEmpty():
            for val in pb_stop.applyFunctions(tree, cur_time=total_time,
                                              cur_size=len(leaf_nodes),
                                              **extra_param):
                done = done or val
        if not self.stopcrit.isEmpty():
            for val in self.stopcrit.applyFunctions(tree, cur_time=total_time,
                                                    cur_size=len(leaf_nodes),
                                                    **extra_param):
                done = done or val
        if done:
            break

        if max_time is None or total_time <= max_time:
            # split a randomly chosen leaf into two new zero-length leaves
            node = random.choice(leaf_nodes)
            c1 = TreeClass()
            c2 = TreeClass()
            node.add_child(c1)
            node.add_child(c2)
            c1.dist = 0.0
            c2.dist = 0.0

    # Name the leaves: library names first, then generated "T<n>" names.
    leaf_compteur = 1
    for ind, node in enumerate(tree.get_leaves()):
        if ind < len(tname):
            node.name = tname[ind]
        else:
            node.name = "T%d" % leaf_compteur
            leaf_compteur += 1
    return tree
def reconcile(genetree=None, lcaMap=None, lost=False, lost_label_fn=None):
    """Reconcile a gene tree topology with a species tree, in place.

    Every node of ``genetree`` receives a ``type`` and a ``dup`` feature:

    - ``TreeClass.SPEC`` / ``dup=False`` by default;
    - ``TreeClass.AD`` / ``dup=True`` when an internal node maps (through
      ``lcaMap``) to the same species-tree node as its first or second child;
    - ``TreeClass.NAD`` when such a duplication node has first and second
      children sharing no species.

    When ``lost`` is enabled, loss nodes (``type=TreeClass.LOST``) are
    inserted into ``genetree`` and the internal nodes created on the way are
    added to ``lcaMap``.

    :param genetree: gene tree to annotate; modified in place.
    :param lcaMap: mapping from gene tree nodes to species tree nodes
        (obtained with lcaMapping); updated in place when losses are added.
    :param lost: ``True`` or the string "yes" (any case) to infer losses.
    :param lost_label_fn: optional callable naming the loss nodes inserted
        above a child. It receives the list of lost species when there are
        several, or the single species string otherwise.
    :raises Exception: if ``genetree`` or ``lcaMap`` is None.
    """
    if (lcaMap is None or genetree is None):
        raise Exception("lcaMapping or genetree not found")
    else:
        # running index used to build the names of multi-species loss nodes
        lost_count = 1
        # First pass: label each node as speciation or duplication.
        for node in genetree.traverse("levelorder"):
            node.add_features(type=TreeClass.SPEC)
            node.add_features(dup=False)
            # Duplication: the node maps to the same species node as one of
            # its first two children.
            if (not node.is_leaf() and (lcaMap[node] == lcaMap[node.get_child_at(0)] or lcaMap[node] == lcaMap[node.get_child_at(1)])):
                node.dup = True
                node.type = TreeClass.AD
                # No species shared between both children ==> NAD
                if not (set(node.get_child_at(0).get_species()).intersection(
                        set(node.get_child_at(1).get_species()))):
                    node.type = TreeClass.NAD

        # Second pass (optional): insert the loss nodes.
        if (isinstance(lost, basestring) and lost.upper() == "YES") or lost == True:
            for node in genetree.traverse("postorder"):
                children_list = node.get_children()
                node_is_dup = (node.type == TreeClass.NAD or node.type == TreeClass.AD)
                # children_list is appended to inside this loop, so the
                # subtrees inserted below are visited by the loop as well.
                for child_c in children_list:
                    # A child needs padding when its species node is not the
                    # one expected below ``node``: the same species node for a
                    # duplication, a direct child of it for a speciation.
                    if ((node_is_dup and lcaMap[child_c] != lcaMap[node]) or (not node_is_dup and (lcaMap[child_c].up != lcaMap[node]))):
                        # Climb the species tree one level per iteration,
                        # stacking an internal node with a loss leaf above
                        # child_c each time.
                        while ((lcaMap[child_c].up != lcaMap[node] and node.type == TreeClass.SPEC) or (lcaMap[child_c] != lcaMap[node] and node.type != TreeClass.SPEC)):
                            lostnode = TreeClass()
                            intern_lost = TreeClass()
                            intern_lost.add_features(type=TreeClass.SPEC)
                            intern_lost.add_features(dup=False)
                            # The new internal node maps to the parent of the
                            # child's species node (or to that node itself
                            # when it is already the species tree root).
                            if lcaMap[child_c].is_root():
                                intern_lost.species = ",".join(
                                    lcaMap[child_c].get_leaf_names())
                                lcaMap.update({intern_lost: lcaMap[child_c]})
                            else:
                                intern_lost.species = ",".join(
                                    lcaMap[child_c].up.get_leaf_names())
                                lcaMap.update(
                                    {intern_lost: lcaMap[child_c].up})
                            # The loss is a single leaf carrying every lost
                            # species; change here to display a subtree
                            # instead of a leaf with a lot of species.
                            lostnode.species = ",".join(
                                set(lcaMap[intern_lost].get_leaf_names()) - set(lcaMap[child_c].get_leaf_names()))
                            splist = lostnode.species.split(',')
                            if (len(splist) > 1):
                                if lost_label_fn:
                                    lostnode.name = lost_label_fn(splist)
                                else:
                                    # "lost_<count>_" + 3-letter species prefixes
                                    lostnode.name = "lost_" + \
                                        str(lost_count) + "_" + \
                                        "|".join([s[0:3] for s in splist])
                            else:
                                if lost_label_fn:
                                    lostnode.name = lost_label_fn(
                                        lostnode.species)
                                else:
                                    lostnode.name = "lost_" + lostnode.species
                            lostnode.add_features(type=TreeClass.LOST)
                            lostnode.add_features(dup=False)
                            lost_count += 1
                            # Move child_c under the new internal node, next
                            # to the loss leaf, and continue from that node.
                            child_c.detach()
                            intern_lost.add_child(child=lostnode)
                            intern_lost.add_child(child=child_c)
                            child_c = intern_lost
                        # Attach the top of the padded chain back to node.
                        node.add_child(child_c)
                        children_list.append(child_c)

                # Case of polytomy in the species tree: species below the
                # mapped species node that no child accounts for are added as
                # one extra loss leaf.
                if not node.is_leaf():
                    specie_list = ",".join([
                        ",".join(lcaMap[child_c].get_leaf_names()) for child_c in node.get_children()
                    ])
                    child_specie_set = set(specie_list.split(","))
                    real_specie_list = set(lcaMap[node].get_leaf_names())
                    unadded_specie = real_specie_list - child_specie_set
                    if (unadded_specie):
                        lostnode = TreeClass()
                        lostnode.add_features(type=TreeClass.LOST)
                        lostnode.add_features(dup=False)
                        lostnode.species = ",".join(unadded_specie)
                        # lost_label_fn is not applied to these loss nodes
                        if (len(unadded_specie) > 1):
                            lostnode.name = "lost_" + \
                                str(lost_count) + "_" + \
                                "|".join([s[0:3] for s in unadded_specie])
                        else:
                            lostnode.name = "lost_" + lostnode.species
                        lost_count += 1
                        node.add_child(lostnode)
        # flag the tree as reconciled
        genetree.add_features(reconciled=True)
def pure_birth_tree(self, birth=1.0, **kwargs):
    """Generate a uniform-rate pure-birth process tree.

    Supplemental keyword arguments:

    - ``nsize`` : total number of leaves before stopping. When it is not
      provided, the length of ``names_library`` is used.
    - ``names_library`` : list of names for the leaves. Leaves beyond the
      end of the library are named "T1", "T2", etc.
    - ``max_time`` : maximum time for simulation.

    All keyword arguments except ``removeloss`` are forwarded to the
    stopping functions, together with the current time and tree size.

    :param birth: birth rate; coerced to float before use.
    :returns: the simulated ``TreeClass`` instance with named leaves.
    :raises MissingParameterError: when no stopping rule is available
        (no ``nsize``/``names_library``, no ``max_time`` and an empty
        ``self.stopcrit``).
    """
    tree = TreeClass()
    # branch length of the root: waiting time compared to its parent
    tree.dist = 0.0
    done = False
    tname = kwargs.get("names_library", [])
    nsize = kwargs.get("nsize", len(tname))
    max_time = kwargs.get("max_time", None)

    pb_stop = FunctionSlot("Pure birth stopping")
    if nsize:
        pb_stop.add(stop_with_tree_size)
    if max_time:
        pb_stop.add(stop_with_max_time)
    if pb_stop.isEmpty() and self.stopcrit.isEmpty():
        raise MissingParameterError(
            "Either specify a names_library, nsize, max_time or a stopping criterion")

    # Parameters handed to every stopping function.
    extra_param = dict(
        (k, v) for k, v in kwargs.items()
        if k not in ('nsize', 'max_time', 'removeloss'))
    extra_param['nsize'] = nsize
    extra_param['max_time'] = max_time

    # Coerce once: under Python 2 an integer ``birth`` would turn the rate
    # below into an integer division (e.g. 1 / 2 == 0), which makes
    # random.expovariate fail with ZeroDivisionError.
    birth_rate = float(birth)

    total_time = 0
    while True:
        leaf_nodes = tree.get_leaves()
        # Waiting time before the next birth.
        # NOTE(review): the rate is n_leaves / birth, as in the original
        # code; a Yule process would usually use n_leaves * birth. Confirm
        # the intent before changing it.
        wtime = random.expovariate(len(leaf_nodes) / birth_rate)
        total_time += wtime
        for leaf in leaf_nodes:
            leaf.dist += wtime

        # Every stopping function is evaluated; any truthy answer stops.
        if not pb_stop.isEmpty():
            for val in pb_stop.applyFunctions(tree, cur_time=total_time,
                                              cur_size=len(leaf_nodes),
                                              **extra_param):
                done = done or val
        if not self.stopcrit.isEmpty():
            for val in self.stopcrit.applyFunctions(tree, cur_time=total_time,
                                                    cur_size=len(leaf_nodes),
                                                    **extra_param):
                done = done or val
        if done:
            break

        if max_time is None or total_time <= max_time:
            # split a randomly chosen leaf into two new zero-length leaves
            node = random.choice(leaf_nodes)
            c1 = TreeClass()
            c2 = TreeClass()
            node.add_child(c1)
            node.add_child(c2)
            c1.dist = 0.0
            c2.dist = 0.0

    # Name the leaves: library names first, then generated "T<n>" names.
    leaf_compteur = 1
    for ind, node in enumerate(tree.get_leaves()):
        if ind < len(tname):
            node.name = tname[ind]
        else:
            node.name = "T%d" % leaf_compteur
            leaf_compteur += 1
    return tree
def event_in_time(time, node, spnode):
    """Simulate the events hitting ``node`` during ``time`` on ``spnode``.

    Draws the waiting time to the next event, then either stops (no event
    fits in the remaining time) or applies a duplication, a loss or a
    transfer and recurses on the created children with the time left.

    Reads ``event_rate``, ``birth``, ``death``, ``INF``, ``ign_suc_trn`` and
    ``self`` from the enclosing scope, and updates ``map_to_spec``,
    ``ecounter``, ``died`` and ``transfered`` defined there.
    """
    def _new_child():
        # create an empty gene node and hang it under ``node``
        child = TreeClass()
        node.add_child(child)
        return child

    # waiting time until the next event (never, when nothing can happen)
    next_t = INF if event_rate == 0.0 else random.expovariate(event_rate)

    if next_t > time:
        # no event on this branch: it simply spans the remaining time
        node.dist = time
        node.add_features(type=INF)
        return

    eprob = random.random()
    node.dist = next_t
    time_left = time - next_t

    if eprob < birth * 1.0 / event_rate:
        # birth ==> duplication: both copies stay in the same species and
        # each one is simulated over the remaining time
        for _ in (0, 1):
            copy = _new_child()
            map_to_spec[copy] = spnode
            event_in_time(time_left, copy, spnode)
        node.add_features(type=TreeClass.AD)
        ecounter['dup'] += 1
        return

    if eprob < (birth + death) * 1.0 / event_rate:
        # death ==> loss
        node.add_features(type=TreeClass.LOST)
        map_to_spec[node] = spnode
        ecounter['loss'] += 1
        died.add(node)
        return

    # otherwise: give the gene to another species ==> transfer
    receivers = list(
        spnode.get_incomparable_list(timeconsistent=True, wtime=next_t))
    transfer_refused = not receivers or (
        ign_suc_trn and node.up and
        node.up.has_feature('type', name=TreeClass.TRANSFER))
    if transfer_refused:
        # keep node as it is
        node.add_features(type=INF)
        self.debug_msg("Could not perform transfer at node")
        self.debug_msg(node)
        return

    cand_receiver = random.choice(receivers)
    node.add_features(type=TreeClass.TRANSFER)
    ecounter['transfer'] += 1

    # copy kept by the donor species
    kept = _new_child()
    map_to_spec[kept] = spnode
    event_in_time(time_left, kept, spnode)

    # copy handed over to the receiver species
    moved = _new_child()
    moved.add_features(transfered=True)
    transfered[moved] = cand_receiver
    map_to_spec[moved] = cand_receiver
    t = cand_receiver.brlen - spnode.brlen + time - next_t
    event_in_time(t, moved, cand_receiver)
def polySolverPreprocessing(genetree, specietree, distance_mat, capitalize=False, gene_sep=None, specie_pos="postfix", nFlagVal=1e305, nFlag=False, smap=None, errorproof=False):
    """Preprocess the inputs of polytomysolver.

    :param genetree: gene tree, as a ``TreeClass`` instance or a string
        accepted by ``TreeClass`` / ``newickPreprocessing``.
    :param specietree: species tree, as a ``TreeClass`` instance or a string.
    :param distance_mat: a string handed to ``clu.distMatProcessor``, or any
        other truthy value to compute the distances from the gene tree.
    :param capitalize: forwarded to ``genetree.set_species``.
    :param gene_sep: gene/species separator, forwarded to
        ``genetree.set_species``.
    :param specie_pos: position of the species in the gene name ("postfix"
        by default), forwarded to ``genetree.set_species``.
    :param nFlagVal: forwarded to ``clu.distMatProcessor``.
    :param nFlag: forwarded to ``clu.distMatProcessor``.
    :param smap: gene to species mapping: a dict, a file name or an open
        file. Each non-blank line of the file holds a gene pattern and a
        species; a bare ``*`` in the pattern is treated as ``.*``.
    :param errorproof: when true, try to repair a mismatch between the
        matrix and the tree instead of raising.
    :returns: ``(genetree, specietree, gene_matrix, node_order)``.
    :raises ValueError: duplicated gene in the tree, mismatch between matrix
        and tree (without ``errorproof``), or falsy ``distance_mat``.
    :raises IndexError: a gene could not be removed from the matrix.
    :raises Exception: genes missing from the matrix, or species of the gene
        tree missing from the species tree.
    """
    # genetree input
    speciemap = None
    if isinstance(genetree, basestring) and not smap:
        genetree, gene_sep = newickPreprocessing(genetree, gene_sep)
        genetree = TreeClass(genetree)

    elif smap:
        # The tree must be parsed whatever the kind of mapping: previously a
        # dict mapping left a newick string untouched and set_species failed.
        if isinstance(genetree, basestring):
            genetree = TreeClass(genetree)

        if isinstance(smap, dict):
            speciemap = smap
        else:
            regexmap = {}
            speciemap = {}
            with open(smap, 'rU') if isinstance(smap, basestring) else smap as INPUT:
                for line in INPUT:
                    fields = line.strip().split()
                    if not fields:
                        # tolerate blank lines in the mapping file
                        continue
                    g, s = fields
                    # shell-like wildcard ==> regular expression
                    if ('*') in g and '.*' not in g:
                        g = g.replace('*', '.*')
                    g_regex = re.compile(g, re.IGNORECASE)
                    regexmap[g_regex] = s

            for leaf in genetree:
                for key, value in regexmap.items():
                    if key.match(leaf.name):
                        speciemap[leaf.name] = value

    genetree.set_species(
        speciesMap=speciemap, sep=gene_sep, capitalize=capitalize, pos=specie_pos)

    # genetree check: every gene must appear only once
    leaf_names = genetree.get_leaf_names()
    if len(genetree) != len(set(leaf_names)):
        seen = set()
        duplicates = set()
        for x in leaf_names:
            if x in seen:
                duplicates.add(x)
            seen.add(x)
        raise ValueError(
            "Your polytomy contains the following gene multiple times : %s" % ", ".join(duplicates))

    # specietree input
    if isinstance(specietree, basestring):
        specietree, sep = newickPreprocessing(specietree, '')
        specietree = TreeClass(specietree)
    specietree.label_internal_node()

    # distance matrix input
    if(distance_mat):
        if isinstance(distance_mat, basestring):
            gene_matrix, node_order = clu.distMatProcessor(
                distance_mat, nFlagVal, nFlag)
        else:
            # distance mat is provided as a boolean
            # in that case, just try to get it from the genetree
            gene_matrix, node_order = get_distance_from_tree(genetree)

        # The matrix and the tree must describe the same genes
        listerr = set(node_order).symmetric_difference(
            set(genetree.get_leaf_names()))
        if listerr:
            if not errorproof:
                raise ValueError(
                    "Different genes in distance matrix and genetree\n : See symmetric difference : %s\n" % ", ".join(listerr))
            else:
                if gene_sep:
                    resetNodeName(genetree, gene_sep, specie_pos == 'postfix')
                else:
                    exib1 = set(node_order).difference(
                        set(genetree.get_leaf_names()))
                    exib2 = set(genetree.get_leaf_names()
                                ).difference(set(node_order))
                    if exib2:
                        raise Exception(
                            'Genes in trees and not in matrix : %s' % (exib2))
                    elif exib1:
                        print("Genes in matrix and not in tree : %s \nAttempt to correct distance matrix" % (
                            ", ".join(exib1)))
                        # drop the extra genes from the matrix
                        for l in exib1:
                            try:
                                lpos = node_order.index(l)
                                gene_matrix = clu.remove_ij(
                                    gene_matrix, lpos, lpos)
                                del node_order[lpos]
                            except Exception:
                                raise IndexError(
                                    "Could not remove gene %s from distance matrix" % l)
    else:
        # This is for debug, will never happen
        raise ValueError(
            "distance matrix not provided and could not be infered from tree")

    # Find list of species in genetree but not in specietree
    specieGeneList = set(genetree.get_leaf_species())
    specieList = set([x.name for x in specietree.get_leaves()])
    missing_species = specieGeneList - specieList
    if missing_species:
        if len(specieGeneList.intersection(specieList)) == 0 and gene_sep:
            raise Exception(
                "*** You probably didn't set the correct species position for you input tree !!")
        raise Exception("Species in genetree but not in specietree : %s" % (
            ", ".join(missing_species)))

    return genetree, specietree, gene_matrix, node_order
def sample_from_tree(self, sptree, birth, death, gain, **kwargs):
    """Sample a gene tree within a species tree using the given rates.

    Supplemental keyword arguments:

    - ``removeloss`` (default True): delete the lost lineages from the
      returned tree.
    - ``disallow_suc_trn`` (default True): passed as ``ign_suc_trn`` to
      ``self.sample_event_on_branches``.
    - ``names_library``: either a sequence of leaf names, used in turn
      (a name used again gets "_<name>" appended once per completed pass
      over the sequence), or a callable ``f(species_name, count)``
      returning the name. By default leaves are named "<species>_<count>".

    :param sptree: species tree; its branch lengths are computed and its
        internal nodes labelled in place.
    :param birth: rate passed to ``self.sample_event_on_branches``.
    :param death: rate passed to ``self.sample_event_on_branches``.
    :param gain: rate passed to ``self.sample_event_on_branches``.
    :returns: ``(gtree, recon, events, true_event_counter, transfers)``:
        the gene tree, the gene node to species node mapping, the event
        label of each gene node, the event counters and the transfer map.
    :raises TotalExtinction: when the resulting tree has at most one leaf.
    """
    # initialize gene tree
    sptree.compute_branches_length()
    sptree.label_internal_node()
    removeloss = kwargs.get("removeloss", True)
    disallow_suc_trn = kwargs.get("disallow_suc_trn", True)
    leave_names = kwargs.get("names_library", [])

    gtree = TreeClass()
    recon = {gtree: sptree}
    events = {gtree: "spec"}
    losses = set()
    transfers = {}
    true_event_counter = ddict(int)
    snode_counter = ddict(int)

    if not leave_names:
        leave_names = lambda sp, x: sp + "_" + str(x)
    # One-element list so the nested function can advance the counter
    # (Python 2 has no ``nonlocal``).
    name_counter = [0]

    def create_history(snode, gnode):
        if snode.is_leaf():
            if callable(leave_names):
                snode_counter[snode.name] += 1
                gnode.name = leave_names(
                    snode.name, snode_counter[snode.name])
            else:
                # Cycle through the library; ``//`` keeps the repetition
                # count an integer on every Python version.
                position = name_counter[0]
                n_encounter = position // len(leave_names)
                base_name = leave_names[position % len(leave_names)]
                gnode.name = base_name + ("_" + base_name) * n_encounter
                name_counter[0] += 1
            events[gnode] = "leaf"
            gnode.add_features(type=TreeClass.SPEC)

        else:
            for schild in snode.get_children():
                # get the events for branch (snode, schild)
                # during time = schild.dist
                recnode, died, transfered, smap = self.sample_event_on_branches(
                    schild.dist, schild, birth, death, gain,
                    keeplosses=(not removeloss),
                    ign_suc_trn=disallow_suc_trn,
                    ecounter=true_event_counter)
                gnode.add_child(recnode)
                # update the history with what happened on the branch
                losses.update(died)
                transfers.update(transfered)
                recon.update(smap)

                # then record the reconciliation of the sampled subtree
                next_cand = []
                for node in recnode.traverse():
                    node.add_features(species=recon[node].name)
                    if node.type == TreeClass.LOST:
                        events[node] = "loss"
                        losses.add(node)
                    elif node.is_leaf():
                        # surviving lineage: keeps evolving below
                        node.add_features(type=TreeClass.SPEC)
                        events[node] = "spec"
                        next_cand.append(node)
                    elif node.type == TreeClass.AD:
                        events[node] = "dup"
                    else:
                        events[node] = "transfer"

                for new_node in next_cand:
                    create_history(recon[new_node], new_node)

            # if no child for node then it is a loss
            if gnode.is_leaf():
                losses.add(gnode)

    create_history(sptree, gtree)
    gtree.delete_single_child_internal(enable_root=True)

    if removeloss:
        for node in gtree.traverse():
            if node in losses:
                node.delete()
        gtree.delete_single_child_internal(enable_root=True)

    # forget the nodes that are not in the final tree
    remove_from_history = set(recon) - set(gtree.traverse())
    for node in remove_from_history:
        del recon[node]
        events.pop(node, None)

    if len(gtree) <= 1:
        raise TotalExtinction("All taxa are extinct.")
    return gtree, recon, events, true_event_counter, transfers
def birth_death_tree(self, birth, death, **kwargs):
    """Return a birth-death tree with edge lengths in continuous units.

    The birth rate is given by ``birth`` and the death rate by ``death``.
    Supplemental keyword arguments:

    - ``nsize`` : total number of leaves before stopping. When it is not
      provided, the length of ``names_library`` is used.
    - ``names_library`` : list of names for the leaves. Leaves beyond the
      end of the library are named "T1", "T2", etc.
    - ``max_time`` : the tree is grown for a maximum of ``max_time``.
    - ``removeloss`` (default True): extinct taxa are removed.
    - ``repeat_until_success`` (default True): when all lineages go
      extinct, start a new branching process instead of raising.

    All keyword arguments except ``removeloss`` are forwarded to the
    stopping functions, together with the current time and tree size.

    :returns: the simulated ``TreeClass`` instance.
    :raises MissingParameterError: when no stopping rule is available.
    :raises TotalExtinction: when all lineages went extinct and
        ``repeat_until_success`` is false.
    """
    done = False
    removeloss = kwargs.get("removeloss", True)
    repeat_until_success = kwargs.get("repeat_until_success", True)
    names_library = kwargs.get("names_library", [])
    nsize = kwargs.get("nsize", len(names_library))
    max_time = kwargs.get("max_time", None)

    pb_stop = FunctionSlot("birth death stopping")
    if nsize:
        pb_stop.add(stop_with_tree_size)
    if max_time:
        pb_stop.add(stop_with_max_time)
    if pb_stop.isEmpty() and self.stopcrit.isEmpty():
        raise MissingParameterError(
            "Either specify a names_library, nsize, max_time or a stopping criterion")

    # Parameters handed to every stopping function.
    extra_param = dict(
        (k, v) for k, v in kwargs.items()
        if k not in ('nsize', 'max_time', 'removeloss'))
    extra_param['nsize'] = nsize
    extra_param['max_time'] = max_time

    # initialize tree
    tree = TreeClass()
    tree.dist = 0.0
    leaf_nodes = tree.get_leaves()
    curr_num_leaves = len(leaf_nodes)
    total_time = 0
    died = set([])
    event_rate = float(birth + death)

    while True:
        # waiting time based on event_rate
        wtime = random.expovariate(event_rate)
        total_time += wtime
        for leaf in leaf_nodes:
            # extinct leaves cannot update their branches length (they are
            # also removed from leaf_nodes when they die, see below)
            if not leaf.has_feature('name', name=TreeClass.LOST):
                leaf.dist += wtime

        # Every stopping function is evaluated; any truthy answer stops.
        if not pb_stop.isEmpty():
            for val in pb_stop.applyFunctions(tree, cur_time=total_time,
                                              cur_size=curr_num_leaves,
                                              **extra_param):
                done = done or val
        if not self.stopcrit.isEmpty():
            for val in self.stopcrit.applyFunctions(tree, cur_time=total_time,
                                                    cur_size=curr_num_leaves,
                                                    **extra_param):
                done = done or val
        if done:
            break

        # if event occurs within time constraints
        if max_time is None or total_time <= max_time:
            # select node at random, then find chance it died or give birth
            # (speciation)
            node = random.choice(leaf_nodes)
            eprob = random.random()
            leaf_nodes.remove(node)
            curr_num_leaves -= 1
            if eprob < birth / event_rate:
                # Speciation: the node gets two new zero-length leaves
                c1 = TreeClass()
                c2 = TreeClass()
                c1.dist = 0
                c2.dist = 0
                node.add_features(type=TreeClass.SPEC)
                node.add_child(c1)
                node.add_child(c2)
                leaf_nodes.append(c1)
                leaf_nodes.append(c2)
                curr_num_leaves += 2
            elif curr_num_leaves > 0:
                # Extinction of one lineage among several
                died.add(node)
                node.add_features(type=TreeClass.LOST)
            else:
                # Extinction of the last lineage
                if not repeat_until_success:
                    raise TotalExtinction(
                        "All lineage went extinct, please retry")
                # Restart the simulation because the tree has gone extinct.
                # The root branch is reset exactly like in the initial
                # setup, so that a restarted run starts from the same state.
                tree = TreeClass()
                tree.dist = 0.0
                leaf_nodes = tree.get_leaves()
                curr_num_leaves = 1
                died = set([])
                total_time = 0
            # this will always hold true
            assert curr_num_leaves == len(leaf_nodes)

    if removeloss:
        leaves = set(tree.get_leaves()) - died
        tree.prune(leaves)
        tree.delete_single_child_internal(enable_root=True)

    # Name the surviving leaves: library names first, then "T<n>" names.
    leaf_compteur = 1
    nlc = 0
    for node in tree.get_leaves():
        if not node.has_feature('type', name=TreeClass.LOST):
            if nlc < len(names_library):
                node.name = names_library[nlc]
                nlc += 1
            else:
                node.name = "T%d" % leaf_compteur
                leaf_compteur += 1
    return tree