Exemple #1
0
def makeRandomTree(names=list(string.lowercase), contract_seuil=0, feature_to_contract='support', random_branches=False):
    """Build a randomly populated gene tree, then contract it.

    :argument names: leaf names handed to ``populate`` as ``names_library``; one leaf is requested per name
    :argument contract_seuil: threshold forwarded to ``contract_tree`` as ``seuil``
    :argument feature_to_contract: node feature forwarded to ``contract_tree`` as ``feature``
    :argument random_branches: forwarded unchanged to ``populate``
    :return: the populated and contracted TreeClass instance
    """
    random_tree = TreeClass()
    leaf_count = len(names)
    random_tree.populate(leaf_count,
                         names_library=names,
                         random_branches=random_branches)
    random_tree.contract_tree(feature=feature_to_contract,
                              seuil=contract_seuil)
    return random_tree
def executePipe(tree, nxsfile=None, fasta=None, al=0, type=None, treefile=None):

	n=[]
	for leaf in tree:
		if(len(n)<7):
			n.append(leaf.name)

	tree.prune(n)

	if(treefile is not None):
		tree=TreeClass(treefile)
	else:
		try:
			tree.write(format=0, outfile="tree.nw");
			treefile="tree.nw"
		except Exception as e:
			print e
			print "Can't write tree to 'tree.nw'"

	if not isinstance(tree, TreeClass):
		raise ValueError ("You sould use a TreeNode instance")

	if(nxsfile is None):
		if fasta is None:
			print
			print "WRITING your sequence into a fasta file"
			tree.writeSeqToFasta(comment=0)
			fasta="seq.fasta"

		nxsfile=write_al_in_nxs_file(fasta, al=al)

	executePhyML(nxsfile, treefile)
Exemple #3
0
def getRFval(refTree_path, tree_path, unroot=False):
    """Compare two trees with ``robinson_foulds``.

    :argument refTree_path: input used to build the reference TreeClass
    :argument tree_path: input used to build the TreeClass compared to the reference
    :argument unroot: when true, the reference tree is unrooted first and
        ``unrooted_trees=True`` is passed to ``robinson_foulds``
    :return: the first two values returned by ``robinson_foulds`` (rf, max_rf)
    """
    reference = TreeClass(refTree_path)
    candidate = TreeClass(tree_path)
    if unroot:
        reference.unroot()
    comparison = reference.robinson_foulds(candidate, unrooted_trees=unroot)
    # The call yields five values; only the first two are reported.
    rf, max_rf, _common, _parts_ref, _parts_tree = comparison
    return rf, max_rf
def fetch_ensembl_genetree_by_member(memberID=None,
                                     species=None,
                                     id_type=None,
                                     output="nh",
                                     nh_format="full"):
    """Fetch genetree from a member ID
    :argument memberID: the ensembl gene ID member of the tree to fetch, this is mandatory! EX: ENSG00000157764
    :argument species: Registry name/aliases used to restrict searches by. Only required if a stable ID is not unique to a species (not the case with Ensembl databases) EX: human, homo_sapiens
    :argument id_type: Object type to restrict searches to. Used when a stable ID is not unique to a single class. EX: gene, transcript
    :argument output: nh / phyloxml, type of output we are looking for!
    :argument nh_format: full / display_label_composite / simple / species / species_short_name / ncbi_taxon / ncbi_name / njtree / phylip, The format of the nh output, only useful when the output is set to nh
    :return: a TreeClass built from the response body, or the result of
        getTreeFromPhyloxml when the phyloxml output is requested
    :raises ValueError: when memberID is empty or the server does not answer with status 200
    """
    if not memberID:
        # Was `valueError`, which raised NameError instead of the intended error.
        raise ValueError('Please provide a genetree id')

    http = httplib2.Http(".cache")
    server = "http://rest.ensembl.org"
    ext = "/genetree/member/id/%s?" % (memberID)
    if species:
        ext = ext + "species=" + species + ";"
    if id_type:
        ext = ext + "object_type=" + id_type + ";"
    if output == "nh":
        ext = ext + "nh_format=%s;" % nh_format
    # The requested format is sent as a "text/x-<output>" Content-Type header.
    output = "text/x-" + output
    resp, content = http.request(server + ext,
                                 method="GET",
                                 headers={"Content-Type": output})
    if not resp.status == 200:
        print("Invalid response: ", resp.status)
        raise ValueError('Failled to process request!')
    if output.lower() != "text/x-phyloxml":
        return TreeClass(content)
    return getTreeFromPhyloxml(content)
def fetch_ensembl_genetree_by_id(treeID=None,
                                 aligned=0,
                                 sequence="none",
                                 output="nh",
                                 nh_format="full"):
    """Fetch genetree from ensembl tree ID
    :argument treeID: the ensembl tree ID, this is mandatory
    :argument aligned: boolean (0/1), used with sequence to retrieve aligned sequence
    :argument sequence: none / protein /cdna /gene, should we retrieve sequence also?, work only with phyloxml nh_format
    :argument output: nh / phyloxml, type of output we are looking for!
    :argument nh_format: full / display_label_composite / simple / species / species_short_name / ncbi_taxon / ncbi_name / njtree / phylip, The format of the nh output, only useful when the output is set to nh
    :return: a TreeClass built from the response body, or the result of
        getTreeFromPhyloxml when the phyloxml output is requested
    :raises ValueError: when treeID is empty or the response code is not 200
    """
    if not treeID:
        # Was `valueError`, which raised NameError instead of the intended error.
        raise ValueError('Please provide a genetree id')

    server = "http://rest.ensembl.org"
    ext = "/genetree/id/%s?sequence=%s;aligned=%i" % (treeID, sequence,
                                                      aligned)
    if output == "nh":
        ext = ext + ";nh_format=%s" % nh_format
    # The requested format is sent as a "text/x-<output>" Content-Type header.
    output = "text/x-" + output
    request = urllib2.Request(server + ext,
                              headers={"Content-Type": output})
    resp = urllib2.urlopen(request)
    try:
        content = resp.read()
        status = resp.getcode()
    finally:
        # Release the connection even when reading fails.
        resp.close()
    if not status == 200:
        print("Invalid response: ", status)
        raise ValueError('Failled to process request!')

    if output.lower() != "text/x-phyloxml":
        return TreeClass(content)
    return getTreeFromPhyloxml(content)
Exemple #6
0
def condense_node_order(matrice, smallest_index, node_order, method='upgma'):
    """
    Merge the two nodes designated by smallest_index inside node_order.

    The two nodes receive a ``length`` feature and become the children of a
    new TreeClass node. The new node is stored in the slot of the second
    index, then the entry at the first index is deleted, so node_order
    shrinks by one element (it is modified in place and also returned).

    Branch lengths depend on method (case insensitive):
    'nj' uses the pair returned by paired_node_distance,
    'upgma' gives each node half of matrice[index1, index2],
    anything else gives (0, 0).
    The new node gets the sum of both lengths as its ``length`` feature."""
    first_pos, second_pos = smallest_index
    first_node = node_order[first_pos]
    second_node = node_order[second_pos]

    algorithm = method.lower()
    if algorithm == 'nj':
        lengths = paired_node_distance(matrice, smallest_index)
    elif algorithm == 'upgma':
        half = matrice[first_pos, second_pos] / 2.0
        lengths = (half, half)
    else:
        lengths = (0, 0)

    first_node.add_features(length=lengths[0])
    second_node.add_features(length=lengths[1])

    # join both nodes under a fresh parent
    merged = TreeClass()
    merged.add_child(first_node)
    merged.add_child(second_node)
    merged.add_features(length=sum(lengths))

    # the merged node takes the second slot, the first slot disappears
    node_order[second_pos] = merged
    del node_order[first_pos]
    return node_order
Exemple #7
0
def makeRandomTree(names=list(string.lowercase), contract_seuil=0, feature_to_contract='support', random_branches=False):
    """Build a randomly populated gene tree, then contract it.

    :argument names: leaf names handed to ``populate`` as ``names_library``; one leaf is requested per name
    :argument contract_seuil: threshold forwarded to ``contract_tree`` as ``seuil``
    :argument feature_to_contract: node feature forwarded to ``contract_tree`` as ``feature``
    :argument random_branches: forwarded unchanged to ``populate``
    :return: the populated and contracted TreeClass instance
    """
    random_tree = TreeClass()
    leaf_count = len(names)
    random_tree.populate(leaf_count,
                         names_library=names,
                         random_branches=random_branches)
    random_tree.contract_tree(feature=feature_to_contract,
                              seuil=contract_seuil)
    return random_tree
Exemple #8
0
def retrieveDupAndLostCost(treefile, streefile, smap, sep=None, pos='prefix'):
    """Reconcile a gene tree with a species tree and return its DL score.

    :argument treefile: input used to build the gene TreeClass
    :argument streefile: input used to build the species TreeClass
    :argument smap: path (string) or already opened file-like object; each line
        holds a gene pattern and a species name separated by whitespace, and
        every '*' of the pattern is turned into the regex '.*'
    :argument sep: forwarded to ``set_species``
    :argument pos: forwarded to ``set_species``
    :return: the value of ``TreeUtils.computeDLScore`` on the reconciled gene tree
    """
    gene_tree = TreeClass(treefile)
    species_tree = TreeClass(streefile)

    if isinstance(smap, basestring):
        map_source = open(smap, 'rU')
    else:
        map_source = smap

    patterns = {}
    with map_source as handle:
        for entry in handle:
            gene_glob, species_name = entry.strip().split()
            patterns[re.compile(gene_glob.replace('*', '.*'))] = species_name

    # Every matching pattern is applied, so the last match in dict order wins.
    leaf_to_species = {}
    for leaf in gene_tree:
        for pattern, species_name in patterns.iteritems():
            if pattern.match(leaf.name):
                leaf_to_species[leaf.name] = species_name

    gene_tree.set_species(speciesMap=leaf_to_species, sep=sep, pos=pos)
    lca_mapping = TreeUtils.lcaMapping(gene_tree, species_tree)
    TreeUtils.reconcile(gene_tree, lcaMap=lca_mapping, lost="yes")
    return TreeUtils.computeDLScore(gene_tree)
Exemple #9
0
def retrieveDupAndLostCost(treefile, streefile, smap, sep=None, pos='prefix'):
    """Reconcile a gene tree with a species tree and return its DL score.

    :argument treefile: input used to build the gene TreeClass
    :argument streefile: input used to build the species TreeClass
    :argument smap: path (string) or already opened file-like object; each line
        holds a gene pattern and a species name separated by whitespace, and
        every '*' of the pattern is turned into the regex '.*'
    :argument sep: forwarded to ``set_species``
    :argument pos: forwarded to ``set_species``
    :return: the value of ``TreeUtils.computeDLScore`` on the reconciled gene tree
    """
    gene_tree = TreeClass(treefile)
    species_tree = TreeClass(streefile)

    if isinstance(smap, basestring):
        map_source = open(smap, 'rU')
    else:
        map_source = smap

    patterns = {}
    with map_source as handle:
        for entry in handle:
            gene_glob, species_name = entry.strip().split()
            patterns[re.compile(gene_glob.replace('*', '.*'))] = species_name

    # Every matching pattern is applied, so the last match in dict order wins.
    leaf_to_species = {}
    for leaf in gene_tree:
        for pattern, species_name in patterns.iteritems():
            if pattern.match(leaf.name):
                leaf_to_species[leaf.name] = species_name

    gene_tree.set_species(speciesMap=leaf_to_species, sep=sep, pos=pos)
    lca_mapping = TreeUtils.lcaMapping(gene_tree, species_tree)
    TreeUtils.reconcile(gene_tree, lcaMap=lca_mapping, lost="yes")
    return TreeUtils.computeDLScore(gene_tree)
Exemple #10
0
def getRFval(refTree_path, tree_path, unroot=False):
    """Compare two trees with ``robinson_foulds``.

    :argument refTree_path: input used to build the reference TreeClass
    :argument tree_path: input used to build the TreeClass compared to the reference
    :argument unroot: when true, the reference tree is unrooted first and
        ``unrooted_trees=True`` is passed to ``robinson_foulds``
    :return: the first two values returned by ``robinson_foulds`` (rf, max_rf)
    """
    reference = TreeClass(refTree_path)
    candidate = TreeClass(tree_path)
    if unroot:
        reference.unroot()
    comparison = reference.robinson_foulds(candidate, unrooted_trees=unroot)
    # The call yields five values; only the first two are reported.
    rf, max_rf, _common, _parts_ref, _parts_tree = comparison
    return rf, max_rf
def polySolverPreprocessing(genetree, specietree, distance_file, capitalize=False, gene_sep = None, specie_pos="postfix", dist_diagonal=1e305, nFlag=False):
    """Prepare the gene tree, species tree and distance matrix for polytomysolver.

    :argument genetree: TreeClass, or a string that goes through
        newick_preprocessing before being parsed
    :argument specietree: TreeClass, or a string; only a string input is parsed
        and gets its internal nodes labelled here
    :argument distance_file: passed to clu.distMatProcessor; when falsy a fake
        matrix is built from the gene tree leaves instead
    :argument capitalize, gene_sep, specie_pos: forwarded to ``set_species``
    :argument dist_diagonal, nFlag: forwarded to the clu matrix helpers
    :return: (genetree, specietree, gene_matrix, node_order)
    :raises Exception: when the gene tree holds species absent from the species tree
    """
    # TODO (kept from the original author):
    #   1) Correct newick
    #   2) Sequence retrieve
    #   3) PhyML to align sequence and make a distance matrice

    # gene tree
    if isinstance(genetree, basestring):
        cleaned_newick, gene_sep = newick_preprocessing(genetree, gene_sep)
        genetree = TreeClass(cleaned_newick)
    genetree.set_species(sep=gene_sep, capitalize=capitalize, pos=specie_pos)

    # species tree
    if isinstance(specietree, basestring):
        cleaned_species_newick, _ = newick_preprocessing(specietree, '')
        specietree = TreeClass(cleaned_species_newick)
        specietree.label_internal_node()

    # distance matrix
    if not distance_file:
        node_order = genetree.get_leaf_names()
        # Alternative: retrieve aligned sequences and run PhyML
        gene_matrix = clu.makeFakeDstMatrice(len(node_order), 0, 1, dist_diagonal)
    else:
        gene_matrix, node_order = clu.distMatProcessor(distance_file, dist_diagonal, nFlag)
        # names present in the matrix but unknown to the gene tree
        unknown_names = set(node_order).difference(set(genetree.get_leaf_names()))
        if unknown_names:
            reset_node_name(genetree, gene_sep)

    # species used by the gene tree must all exist in the species tree
    gene_species = set(genetree.get_leaf_species())
    known_species = set(leaf.name for leaf in specietree.get_leaves())
    missing_species = gene_species - known_species
    if missing_species:
        raise Exception("Species in genetree but not in specietree : %s" % (", ".join(missing_species)))

    return genetree, specietree, gene_matrix, node_order
def getTreeFromPhyloxml(xml, saveToFile="default.xml", delFile=True):
    """
    Read a phylogeny tree from a phyloxml string and return a TreeClass object
    or a list of TreeClass object

    The string is first written to ``saveToFile`` because the Phyloxml
    project is built from a file.

    :argument xml: phyloxml content to parse
    :argument saveToFile: path of the intermediate file (overwritten)
    :argument delFile: remove the intermediate file once done, including
        when parsing fails
    :return: a single TreeClass when exactly one phylogeny is found,
        otherwise the list of TreeClass objects
    """
    project = Phyloxml()
    # `with` guarantees the handle is closed even if the write fails.
    with open(saveToFile, "w+") as fo:
        fo.write(xml)
    try:
        project.build_from_file(saveToFile)
        treeList = [
            TreeClass.import_from_PhyloxmlTree(tree)
            for tree in project.get_phylogeny()
        ]
    finally:
        # Do not leave the intermediate file behind when parsing raises.
        if delFile and os.path.exists(saveToFile):
            os.remove(saveToFile)
    if len(treeList) == 1:
        return treeList[0]
    return treeList
def getTreeFromPhyloxml(xml, saveToFile="default.xml", delFile=True):
    """
    Read a phylogeny tree from a phyloxml string and return a TreeClass object
    or a list of TreeClass object

    The string is first written to ``saveToFile`` because the Phyloxml
    project is built from a file.

    :argument xml: phyloxml content to parse
    :argument saveToFile: path of the intermediate file (overwritten)
    :argument delFile: remove the intermediate file once done, including
        when parsing fails
    :return: a single TreeClass when exactly one phylogeny is found,
        otherwise the list of TreeClass objects
    """
    project = Phyloxml()
    # `with` guarantees the handle is closed even if the write fails.
    with open(saveToFile, "w+") as fo:
        fo.write(xml)
    try:
        project.build_from_file(saveToFile)
        treeList = [
            TreeClass.import_from_PhyloxmlTree(tree)
            for tree in project.get_phylogeny()
        ]
    finally:
        # Do not leave the intermediate file behind when parsing raises.
        if delFile and os.path.exists(saveToFile):
            os.remove(saveToFile)
    if len(treeList) == 1:
        return treeList[0]
    return treeList
def condense_node_order(matrice, smallest_index, node_order, method='upgma'):
    """
    Merge the two nodes designated by smallest_index inside node_order.

    The two nodes receive a ``length`` feature and become the children of a
    new TreeClass node. The new node is stored in the slot of the second
    index, then the entry at the first index is deleted, so node_order
    shrinks by one element (it is modified in place and also returned).

    Branch lengths depend on method (case insensitive):
    'nj' uses the pair returned by paired_node_distance,
    'upgma' gives each node half of matrice[index1, index2],
    anything else gives (0, 0).
    The new node gets the sum of both lengths as its ``length`` feature."""
    first_pos, second_pos = smallest_index
    first_node = node_order[first_pos]
    second_node = node_order[second_pos]

    algorithm = method.lower()
    if algorithm == 'nj':
        lengths = paired_node_distance(matrice, smallest_index)
    elif algorithm == 'upgma':
        half = matrice[first_pos, second_pos] / 2.0
        lengths = (half, half)
    else:
        lengths = (0, 0)

    first_node.add_features(length=lengths[0])
    second_node.add_features(length=lengths[1])

    # join both nodes under a fresh parent
    merged = TreeClass()
    merged.add_child(first_node)
    merged.add_child(second_node)
    merged.add_features(length=sum(lengths))

    # the merged node takes the second slot, the first slot disappears
    node_order[second_pos] = merged
    del node_order[first_pos]
    return node_order
Exemple #15
0
    def sample_event_on_branches(self, time, spnode, birth, death, transfer, gnode=None, keeplosses=False, ign_suc_trn=False, ecounter=None):
        """Simulate a reconstructed birth death tree

        Events are drawn along a branch of duration ``time``: the waiting
        time comes from ``random.expovariate(birth + death + transfer)`` and
        the kind of event (duplication, loss, transfer) is picked according
        to the relative rates.

        :argument time: amount of time available on the branch
        :argument spnode: species tree node the gene lineage evolves in
        :argument birth: duplication rate
        :argument death: loss rate
        :argument transfer: transfer rate
        :argument gnode: gene node to start from; a new TreeClass is created when None
        :argument keeplosses: when false, lost lineages are pruned from the result
        :argument ign_suc_trn: when true, a transfer is not performed on a node
            whose parent is already typed as a transfer
        :argument ecounter: mapping updated in place with the number of 'dup',
            'loss' and 'transfer' events; a private dict is used when None
        :return: (gnode, died, transfered, map_to_spec) -- the gene tree root,
            the set of lost nodes, the transfered node -> receiver species
            mapping and the gene node -> species node mapping
        """
        # A shared `{}` default would leak counts between calls; and `+= 1` on
        # a plain dict raised KeyError, hence the `.get` based increments below.
        if ecounter is None:
            ecounter = {}

        # we are going with a poisson process
        # so the rate of having an event is
        # just the sum of rate
        event_rate = float(birth + death + transfer)
        died = set()
        transfered = {}
        map_to_spec = {}
        # create starting node if one is not given
        if gnode is None:
            gnode = TreeClass()
            map_to_spec[gnode] = spnode

        def event_in_time(time, node, spnode):
            # time for an event
            if event_rate == 0.0:
                next_t = INF
            else:
                next_t = random.expovariate(event_rate)

            if next_t > time:
                # no event on branch
                # we can stop
                node.dist = time
                node.add_features(type=INF)

            else:
                eprob = random.random()
                node.dist = next_t
                if eprob < birth * 1.0 / event_rate:
                    # birth ==> duplication event
                    cnode = TreeClass()
                    node.add_child(cnode)
                    map_to_spec[cnode] = spnode
                    event_in_time(time - next_t, cnode, spnode)
                    # compute event on the remaining time
                    cnode = TreeClass()
                    node.add_child(cnode)
                    map_to_spec[cnode] = spnode
                    event_in_time(time - next_t, cnode, spnode)
                    node.add_features(type=TreeClass.AD)
                    ecounter['dup'] = ecounter.get('dup', 0) + 1

                elif eprob < (birth + death) * 1.0 / event_rate:
                    # death happen ==> loss
                    node.add_features(type=TreeClass.LOST)
                    map_to_spec[node] = spnode
                    ecounter['loss'] = ecounter.get('loss', 0) + 1
                    died.add(node)
                else:
                    # give gene to another species ==> transfer
                    contemp_transfer_nodes = list(
                        spnode.get_incomparable_list(timeconsistent=True, wtime=next_t))
                    if contemp_transfer_nodes and not(ign_suc_trn and node.up and node.up.has_feature('type', name=TreeClass.TRANSFER)):
                        cand_receiver = random.choice(contemp_transfer_nodes)

                        node.add_features(type=TreeClass.TRANSFER)
                        ecounter['transfer'] = ecounter.get('transfer', 0) + 1
                        # first child stays in the donor species
                        cnode = TreeClass()
                        node.add_child(cnode)
                        map_to_spec[cnode] = spnode
                        event_in_time(time - next_t, cnode, spnode)

                        # second child continues in the receiver species
                        cnode = TreeClass()
                        node.add_child(cnode)
                        cnode.add_features(transfered=True)
                        transfered[cnode] = cand_receiver
                        map_to_spec[cnode] = cand_receiver
                        t = cand_receiver.brlen - spnode.brlen + time - next_t
                        event_in_time(t, cnode, cand_receiver)
                    else:
                        # keep node as it is
                        # so speciation
                        node.add_features(type=INF)

                        self.debug_msg("Could not perform transfer at node")
                        self.debug_msg(node)

        event_in_time(time, gnode, spnode)

        if not keeplosses:
            leaves = set(gnode.get_leaves()) - died

            if len(leaves) == 0:
                # every lineage died: the whole tree is reported as lost
                gnode.add_features(type=TreeClass.LOST)
                died.add(gnode)
            else:
                gnode.prune(leaves)
            gnode.delete_single_child_internal()

        return gnode, died, transfered, map_to_spec
def polySolverPreprocessing(genetree,
                            specietree,
                            distance_mat,
                            capitalize=False,
                            gene_sep=None,
                            specie_pos="postfix",
                            nFlagVal=1e305,
                            nFlag=False,
                            smap=None,
                            errorproof=False):
    """Preprocess genetree for polytomysolver

    :argument genetree: TreeClass or string describing the gene tree
    :argument specietree: TreeClass or string describing the species tree
    :argument distance_mat: string handed to clu.distMatProcessor, or any other
        truthy value to compute the distances from the gene tree
    :argument capitalize, gene_sep, specie_pos: forwarded to ``set_species``
    :argument nFlagVal, nFlag: forwarded to clu.distMatProcessor
    :argument smap: gene to species mapping: a dict, a file path or an opened
        file whose lines hold a gene pattern and a species name
    :argument errorproof: when true, try to repair mismatches between the
        matrix and the gene tree instead of raising
    :return: (genetree, specietree, gene_matrix, node_order)
    :raises ValueError: duplicated genes in the tree, unrepaired tree/matrix
        mismatch, or missing distance matrix
    :raises IndexError: a gene could not be removed from the distance matrix
    :raises Exception: genes missing from the matrix (errorproof mode) or
        species of the gene tree missing from the species tree
    """

    # genetree input
    speciemap = None
    if isinstance(genetree, basestring) and not smap:
        genetree, gene_sep = newickPreprocessing(genetree, gene_sep)
        genetree = TreeClass(genetree)

    elif smap:
        # Parse the tree whatever the kind of smap: with a dict smap the
        # string used to reach set_species unparsed.
        if isinstance(genetree, basestring):
            genetree = TreeClass(genetree)
        if isinstance(smap, dict):
            speciemap = smap
        else:
            regexmap = {}
            speciemap = {}
            with open(smap, 'rU') if isinstance(smap,
                                                basestring) else smap as INPUT:
                for line in INPUT:
                    g, s = line.strip().split()
                    # glob style '*' becomes '.*' unless already a regex
                    if ('*') in g and '.*' not in g:
                        g = g.replace('*', '.*')
                    g_regex = re.compile(g, re.IGNORECASE)
                    regexmap[g_regex] = s

            for leaf in genetree:
                for key, value in regexmap.iteritems():
                    if key.match(leaf.name):
                        speciemap[leaf.name] = value

    genetree.set_species(speciesMap=speciemap,
                         sep=gene_sep,
                         capitalize=capitalize,
                         pos=specie_pos)

    # genetree check
    if len(genetree) != len(set(genetree.get_leaf_names())):
        tmp_leaf_name = genetree.get_leaf_names()
        duplicates = set(
            [x for x in tmp_leaf_name if tmp_leaf_name.count(x) > 1])
        raise ValueError(
            "Your polytomy contains the following gene multiple times : %s" %
            ", ".join(duplicates))

    # specietree input
    if isinstance(specietree, basestring):
        specietree, sep = newickPreprocessing(specietree, '')
        specietree = TreeClass(specietree)
    specietree.label_internal_node()

    # distance matrice input
    if (distance_mat):
        if isinstance(distance_mat, basestring):
            gene_matrix, node_order = clu.distMatProcessor(
                distance_mat, nFlagVal, nFlag)
        else:
            # distance mat is provided as a boolean
            # in that case, just try to get it from the genetree
            gene_matrix, node_order = get_distance_from_tree(genetree)
        # Difference check 1
        listerr = set(node_order).symmetric_difference(
            set(genetree.get_leaf_names()))
        if listerr:
            if not errorproof:
                raise ValueError(
                    "Different genes in distance matrix and genetree\n : See symmetric difference : %s\n"
                    % ", ".join(listerr))
            else:
                if gene_sep:
                    resetNodeName(genetree, gene_sep, specie_pos == 'postfix')
                else:
                    exib1 = set(node_order).difference(
                        set(genetree.get_leaf_names()))
                    exib2 = set(genetree.get_leaf_names()).difference(
                        set(node_order))
                    if exib2:
                        raise Exception(
                            'Genes in trees and not in matrix : %s' % (exib2))
                    elif exib1:
                        print(
                            "Genes in matrix and not in tree : %s \nAttempt to correct distance matrix"
                            % (", ".join(exib1)))
                        for l in exib1:
                            try:
                                # position is looked up again each time since
                                # node_order shrinks at every removal
                                lpos = node_order.index(l)
                                gene_matrix = clu.remove_ij(
                                    gene_matrix, lpos, lpos)
                                del node_order[lpos]
                            except Exception:
                                # `except Exception` lets KeyboardInterrupt and
                                # SystemExit propagate untouched.
                                raise IndexError(
                                    "Could not remove gene %s from distance matrix"
                                    % l)

    else:
        # This is for debug, will never happen
        raise ValueError(
            "distance matrix not provided and could not be infered from tree")

    # Find list of species in genetree but not in specietree
    specieGeneList = set(genetree.get_leaf_species())
    specieList = set([x.name for x in specietree.get_leaves()])
    if (specieGeneList - specieList):
        if len(specieGeneList.intersection(specieList)) == 0 and gene_sep:
            raise Exception(
                "*** You probably didn't set the correct species position for you input tree !!"
            )
        raise Exception("Species in genetree but not in specietree : %s" %
                        (", ".join(specieGeneList - specieList)))

    return genetree, specietree, gene_matrix, node_order
def reconcile(geneTree=None, lcaMap=None, lost="no"):
    """Reconcile genetree topology to a specieTree, using an adequate mapping obtained with lcaMapping.
    'reconcile' will infer evolutionary events like gene lost, gene speciation and gene duplication with distinction between AD and NAD

    The gene tree and the mapping are modified in place; nothing is returned.

    :argument geneTree: gene tree whose nodes receive a ``type`` feature
    :argument lcaMap: mapping gene node -> species tree node; entries are
        added for the internal nodes created while inserting losses
    :argument lost: "yes" (case insensitive) to also insert lost nodes
    :raises Exception: when geneTree or lcaMap is None
    """
    # The original tested the builtin `map`, which is never None, so a
    # missing lcaMap was not detected here.
    if lcaMap is None or geneTree is None:
        raise Exception("lcaMapping or geneTree not found")

    # First pass: every node is a speciation unless it maps to the same
    # species node as one of its two first children.
    for node in geneTree.traverse("levelorder"):
        node.add_features(type=TreeClass.SPEC)
        if (not node.is_leaf() and
                (lcaMap[node] == lcaMap[node.get_child_at(0)] or
                 lcaMap[node] == lcaMap[node.get_child_at(1)])):
            node.type = TreeClass.AD
            # NAD when both children share no species
            if not (set(node.get_child_at(0).get_species()).intersection(
                    set(node.get_child_at(1).get_species()))):
                node.type = TreeClass.NAD

    if lost.upper() == "YES":
        for node in geneTree.traverse("postorder"):
            children_list = node.get_children()
            # NOTE: children_list is extended while being iterated, so the
            # re-attached subtrees are visited again by this loop.
            for child_c in children_list:
                if ((lcaMap[child_c].up != lcaMap[node] and
                     lcaMap[child_c] != lcaMap[node]) or
                        (node.type == TreeClass.AD and
                         lcaMap[node] != lcaMap[child_c])):

                    # Stack one (internal node + lost leaf) per species level
                    # skipped between the child and the current node.
                    while ((lcaMap[child_c].up != lcaMap[node] and
                            node.type == TreeClass.SPEC) or
                           (lcaMap[child_c] != lcaMap[node] and
                            node.type != TreeClass.SPEC)):
                        lostnode = TreeClass()
                        intern_lost = TreeClass()
                        intern_lost.type = TreeClass.SPEC
                        if lcaMap[child_c].is_root():
                            intern_lost.species = ",".join(
                                lcaMap[child_c].get_leaf_names())
                            lcaMap.update({intern_lost: lcaMap[child_c]})

                        else:
                            intern_lost.species = ",".join(
                                lcaMap[child_c].up.get_leaf_names())
                            lcaMap.update({intern_lost: lcaMap[child_c].up})

                        # change here to display a subtree and not a leaf with a lot of specie
                        lostnode.species = ",".join(
                            set(lcaMap[intern_lost].get_leaf_names()) -
                            set(child_c.species.split(",")))
                        lostnode.type = TreeClass.LOST
                        child_c.detach()
                        intern_lost.add_child(child=lostnode)
                        intern_lost.add_child(child=child_c)
                        child_c = intern_lost
                    node.add_child(child_c)
                    children_list.append(child_c)

            # Case of polytomie in species tree....
            if not node.is_leaf():
                specie_list = ",".join([
                    ",".join(lcaMap[child_c].get_leaf_names())
                    for child_c in node.get_children()
                ])
                child_specie_set = set(specie_list.split(","))
                real_specie_list = set(lcaMap[node].get_leaf_names())
                unadded_specie = real_specie_list - child_specie_set
                if unadded_specie:
                    # species of the mapped node covered by no child: one loss
                    lostnode = TreeClass()
                    lostnode.type = TreeClass.LOST
                    lostnode.species = ",".join(unadded_specie)
                    node.add_child(lostnode)
Exemple #18
0
        def event_in_time(time, node, spnode):
            """Recursively draw events for ``node`` during ``time`` inside ``spnode``.

            Relies on names of the enclosing scope: event_rate, birth, death,
            ign_suc_trn, self, and the containers map_to_spec, died,
            transfered and ecounter, which are updated in place.

            :argument time: time left on the current branch
            :argument node: gene node receiving ``dist`` and a ``type`` feature
            :argument spnode: species node the gene node currently evolves in
            """
            # time for an event
            if event_rate == 0.0:
                next_t = INF
            else:
                next_t = random.expovariate(event_rate)

            if next_t > time:
                # no event on branch
                # we can stop
                node.dist = time
                node.add_features(type=INF)

            else:
                # one uniform draw selects the event kind from the relative rates
                eprob = random.random()
                node.dist = next_t
                if eprob < birth * 1.0 / event_rate:
                    # birth ==> duplication event
                    cnode = TreeClass()
                    node.add_child(cnode)
                    map_to_spec[cnode] = spnode
                    event_in_time(time - next_t, cnode, spnode)
                    # compute event on the remaining time
                    cnode = TreeClass()
                    node.add_child(cnode)
                    map_to_spec[cnode] = spnode
                    event_in_time(time - next_t, cnode, spnode)
                    node.add_features(type=TreeClass.AD)
                    # NOTE(review): `+= 1` needs the key to exist already or a
                    # defaultdict-like ecounter -- confirm against the caller
                    ecounter['dup'] += 1

                elif eprob < (birth + death) * 1.0 / event_rate:
                    # death happen ==> loss
                    node.add_features(type=TreeClass.LOST)
                    map_to_spec[node] = spnode
                    ecounter['loss'] += 1
                    died.add(node)
                else:
                    # give gene to another species ==> transfer
                    contemp_transfer_nodes = list(
                        spnode.get_incomparable_list(timeconsistent=True,
                                                     wtime=next_t))
                    # a transfer needs a candidate receiver and, when
                    # ign_suc_trn is set, a parent that is not itself a transfer
                    if contemp_transfer_nodes and not (
                            ign_suc_trn and node.up and node.up.has_feature(
                                'type', name=TreeClass.TRANSFER)):
                        cand_receiver = random.choice(contemp_transfer_nodes)

                        node.add_features(type=TreeClass.TRANSFER)
                        ecounter['transfer'] += 1
                        # first child stays in the donor species
                        cnode = TreeClass()
                        node.add_child(cnode)
                        map_to_spec[cnode] = spnode
                        event_in_time(time - next_t, cnode, spnode)

                        # second child continues in the receiver species
                        cnode = TreeClass()
                        node.add_child(cnode)
                        cnode.add_features(transfered=True)
                        transfered[cnode] = cand_receiver
                        map_to_spec[cnode] = cand_receiver
                        # remaining time is shifted by the brlen difference
                        # between the receiver and the donor
                        t = cand_receiver.brlen - spnode.brlen + time - next_t
                        event_in_time(t, cnode, cand_receiver)
                    else:
                        # keep node as it is
                        # so speciation
                        node.add_features(type=INF)

                        self.debug_msg("Could not perform transfer at node")
                        self.debug_msg(node)
Exemple #19
0
    def sample_event_on_branches(self,
                                 time,
                                 spnode,
                                 birth,
                                 death,
                                 transfer,
                                 gnode=None,
                                 keeplosses=False,
                                 ign_suc_trn=False,
                                 ecounter=None):
        """Simulate a reconstructed birth death tree

        Events are drawn along a branch of duration ``time``: the waiting
        time comes from ``random.expovariate(birth + death + transfer)`` and
        the kind of event (duplication, loss, transfer) is picked according
        to the relative rates.

        :argument time: amount of time available on the branch
        :argument spnode: species tree node the gene lineage evolves in
        :argument birth: duplication rate
        :argument death: loss rate
        :argument transfer: transfer rate
        :argument gnode: gene node to start from; a new TreeClass is created when None
        :argument keeplosses: when false, lost lineages are pruned from the result
        :argument ign_suc_trn: when true, a transfer is not performed on a node
            whose parent is already typed as a transfer
        :argument ecounter: mapping updated in place with the number of 'dup',
            'loss' and 'transfer' events; a private dict is used when None
        :return: (gnode, died, transfered, map_to_spec) -- the gene tree root,
            the set of lost nodes, the transfered node -> receiver species
            mapping and the gene node -> species node mapping
        """
        # A shared `{}` default would leak counts between calls; and `+= 1` on
        # a plain dict raised KeyError, hence the `.get` based increments below.
        if ecounter is None:
            ecounter = {}

        # we are going with a poisson process
        # so the rate of having an event is
        # just the sum of rate
        event_rate = float(birth + death + transfer)
        died = set()
        transfered = {}
        map_to_spec = {}
        # create starting node if one is not given
        if gnode is None:
            gnode = TreeClass()
            map_to_spec[gnode] = spnode

        def event_in_time(time, node, spnode):
            # time for an event
            if event_rate == 0.0:
                next_t = INF
            else:
                next_t = random.expovariate(event_rate)

            if next_t > time:
                # no event on branch
                # we can stop
                node.dist = time
                node.add_features(type=INF)

            else:
                eprob = random.random()
                node.dist = next_t
                if eprob < birth * 1.0 / event_rate:
                    # birth ==> duplication event
                    cnode = TreeClass()
                    node.add_child(cnode)
                    map_to_spec[cnode] = spnode
                    event_in_time(time - next_t, cnode, spnode)
                    # compute event on the remaining time
                    cnode = TreeClass()
                    node.add_child(cnode)
                    map_to_spec[cnode] = spnode
                    event_in_time(time - next_t, cnode, spnode)
                    node.add_features(type=TreeClass.AD)
                    ecounter['dup'] = ecounter.get('dup', 0) + 1

                elif eprob < (birth + death) * 1.0 / event_rate:
                    # death happen ==> loss
                    node.add_features(type=TreeClass.LOST)
                    map_to_spec[node] = spnode
                    ecounter['loss'] = ecounter.get('loss', 0) + 1
                    died.add(node)
                else:
                    # give gene to another species ==> transfer
                    contemp_transfer_nodes = list(
                        spnode.get_incomparable_list(timeconsistent=True,
                                                     wtime=next_t))
                    if contemp_transfer_nodes and not (
                            ign_suc_trn and node.up and node.up.has_feature(
                                'type', name=TreeClass.TRANSFER)):
                        cand_receiver = random.choice(contemp_transfer_nodes)

                        node.add_features(type=TreeClass.TRANSFER)
                        ecounter['transfer'] = ecounter.get('transfer', 0) + 1
                        # first child stays in the donor species
                        cnode = TreeClass()
                        node.add_child(cnode)
                        map_to_spec[cnode] = spnode
                        event_in_time(time - next_t, cnode, spnode)

                        # second child continues in the receiver species
                        cnode = TreeClass()
                        node.add_child(cnode)
                        cnode.add_features(transfered=True)
                        transfered[cnode] = cand_receiver
                        map_to_spec[cnode] = cand_receiver
                        t = cand_receiver.brlen - spnode.brlen + time - next_t
                        event_in_time(t, cnode, cand_receiver)
                    else:
                        # keep node as it is
                        # so speciation
                        node.add_features(type=INF)

                        self.debug_msg("Could not perform transfer at node")
                        self.debug_msg(node)

        event_in_time(time, gnode, spnode)

        if not keeplosses:
            leaves = set(gnode.get_leaves()) - died

            if len(leaves) == 0:
                # every lineage died: the whole tree is reported as lost
                gnode.add_features(type=TreeClass.LOST)
                died.add(gnode)
            else:
                gnode.prune(leaves)
            gnode.delete_single_child_internal()

        return gnode, died, transfered, map_to_spec
Exemple #20
0
    def sample_from_tree(self, sptree, birth, death, gain, **kwargs):
        """Simulate a gene tree evolving inside the species tree ``sptree``.

        Events are drawn branch by branch with ``sample_event_on_branches``,
        which receives ``birth``, ``death`` and ``gain`` unchanged.
        A tree in which every leaf is extinct can be produced by this
        function; use ``dlt_tree_from_sptree`` if you want to prevent this.

        Supported keyword arguments:
            - ``removeloss`` (default True): remove lost lineages from the
              returned gene tree.
            - ``disallow_suc_trn`` (default True): forwarded as
              ``ign_suc_trn`` to ``sample_event_on_branches``.
            - ``names_library``: list of names for the gene leaves. Names are
              consumed in order and reused (with a repeated suffix) once the
              list is exhausted. When missing or empty, leaves are named
              ``<species>_<n>``.

        Returns a tuple ``(gtree, recon, events, true_event_counter,
        transfers)`` where ``recon`` maps gene nodes to species nodes and
        ``events`` maps gene nodes to an event label.

        Raises ``TotalExtinction`` when the resulting tree has at most one
        leaf.
        """

        # initialize gene tree
        sptree.compute_branches_length()
        sptree.label_internal_node()
        removeloss = kwargs.get("removeloss", True)
        disallow_suc_trn = kwargs.get("disallow_suc_trn", True)
        leave_names = kwargs.get("names_library", [])

        gtree = TreeClass()
        recon = {gtree: sptree}
        events = {gtree: "spec"}
        losses = set()
        transfers = {}
        true_event_counter = ddict(int)
        snode_counter = ddict(int)
        if not leave_names:
            leave_names = lambda sp, x: sp + "_" + str(x)
        # Kept in a one-element list so the nested function can advance it
        # (a plain int could not be rebound from the closure).
        name_counter = [0]

        def create_history(snode, gnode):
            if snode.is_leaf():
                if isinstance(leave_names, list):
                    # number of complete passes already made over the library
                    n_encounter = name_counter[0] // len(leave_names)
                    gnode.name = leave_names[
                        name_counter[0] % len(leave_names)]
                    gnode.name += ("_" + gnode.name) * n_encounter
                    name_counter[0] += 1
                else:
                    snode_counter[snode.name] += 1
                    gnode.name = leave_names(snode.name,
                                             snode_counter[snode.name])
                events[gnode] = "leaf"
                gnode.add_features(type=TreeClass.SPEC)
            else:
                for schild in snode.get_children():
                    # get branches event for branch (snode, schild)
                    # during time = schild.dist
                    recnode, died, transfered, smap = self.sample_event_on_branches(
                        schild.dist,
                        schild,
                        birth,
                        death,
                        gain,
                        keeplosses=(not removeloss),
                        ign_suc_trn=disallow_suc_trn,
                        ecounter=true_event_counter)
                    gnode.add_child(recnode)
                    # update the set of losses
                    losses.update(died)
                    transfers.update(transfered)
                    recon.update(smap)
                    next_cand = []
                    # then record the reconciliation that happened
                    for node in recnode.traverse():
                        node.add_features(species=recon[node].name)
                        if node.type == TreeClass.LOST:
                            events[node] = "loss"
                            # died at the start of coalescence
                            losses.add(node)
                        elif node.is_leaf():
                            node.add_features(type=TreeClass.SPEC)
                            events[node] = "spec"
                            next_cand.append(node)
                        elif node.type == TreeClass.AD:
                            events[node] = "dup"
                        else:
                            events[node] = "transfer"

                    # surviving lineages keep evolving below their species
                    for new_node in next_cand:
                        create_history(recon[new_node], new_node)

                # if no child for node then it is a loss
                if gnode.is_leaf():
                    losses.add(gnode)

        create_history(sptree, gtree)

        gtree.delete_single_child_internal(enable_root=True)
        if removeloss:
            # Snapshot the traversal first: deleting nodes while the
            # traversal generator is still running mutates the tree under it.
            for node in list(gtree.traverse()):
                if node in losses:
                    node.delete()
            gtree.delete_single_child_internal(enable_root=True)
            remove_from_history = set(recon.keys()) - set(gtree.traverse())
            for node in remove_from_history:
                del recon[node]
                events.pop(node, None)

        if len(gtree) <= 1:
            raise TotalExtinction("All taxa are extinct.")
        return gtree, recon, events, true_event_counter, transfers
Exemple #21
0
    def birth_death_tree(self, birth, death, **kwargs):
        """
        Returns a birth-death tree with birth rate specified by ``birth``, and
        death rate specified by ``death``, and edge lengths in continuous
        (real) units.

        You can pass supplemental keyword arguments:
        - ``nsize`` : total number of leaves before stopping
        - ``names_library`` : list of names for the leaves
        - ``max_time`` : maximum time for simulation
        - if ``nsize`` is not provided, the length of ``names_library`` is used
        - if there are more leaves than names in ``names_library``, the
          remaining leaves are named "T%d" (T1, T2, etc)
        - if ``max_time`` is given, the tree is grown for a maximum of
          ``max_time``
        - if ``removeloss`` is True (default), extinct taxa are removed

        Under some conditions, it is possible for all lineages on a tree to go
        extinct. In this case, if the keyword argument
        ``repeat_until_success`` is True (default), a new branching process is
        started. Otherwise a TotalExtinction error is raised.

        Raises MissingParameterError when no stopping criterion is available
        (no ``nsize``, ``names_library``, ``max_time`` nor ``self.stopcrit``).
        """

        done = False
        removeloss = kwargs.get("removeloss", True)
        repeat_until_success = kwargs.get("repeat_until_success", True)
        names_library = kwargs.get("names_library", [])
        nsize = kwargs.get("nsize", len(names_library))
        max_time = kwargs.get("max_time", None)
        pb_stop = FunctionSlot("birth death stopping")
        if nsize:
            pb_stop.add(stop_with_tree_size)
        if max_time:
            pb_stop.add(stop_with_max_time)
        if pb_stop.isEmpty() and self.stopcrit.isEmpty():
            raise MissingParameterError(
                "Either specify a names_library, nsize, max_time or a stopping criterion"
            )

        # Everything except the options consumed here is forwarded to the
        # stopping functions, together with the resolved nsize / max_time.
        extra_param = {}
        for k, v in kwargs.items():
            if k not in ['nsize', 'max_time', 'removeloss']:
                extra_param[k] = v

        extra_param['nsize'] = nsize
        extra_param['max_time'] = max_time

        # initialize tree
        tree = TreeClass()
        tree.dist = 0.0

        leaf_nodes = tree.get_leaves()
        curr_num_leaves = len(leaf_nodes)

        total_time = 0

        died = set()
        event_rate = float(birth + death)

        while True:
            # waiting time based on event_rate
            wtime = random.expovariate(event_rate)

            total_time += wtime
            for leaf in leaf_nodes:
                # extinct leaves cannot update their branches length;
                # extinction is recorded in the 'type' feature (see below)
                if not leaf.has_feature('type', name=TreeClass.LOST):
                    leaf.dist += wtime

            if not pb_stop.isEmpty():
                for val in pb_stop.applyFunctions(tree,
                                                  cur_time=total_time,
                                                  cur_size=curr_num_leaves,
                                                  **extra_param):
                    done = done or val
            if not self.stopcrit.isEmpty():
                for val in self.stopcrit.applyFunctions(
                        tree,
                        cur_time=total_time,
                        cur_size=curr_num_leaves,
                        **extra_param):
                    done = done or val
            if done:
                break
            # if event occurs within time constraints
            if max_time is None or total_time <= max_time:

                # select node at random, then find chance it died or give
                # birth (speciation)
                node = random.choice(leaf_nodes)
                eprob = random.random()
                leaf_nodes.remove(node)
                curr_num_leaves -= 1
                if eprob < birth / event_rate:
                    # speciation: the chosen leaf gets two fresh children
                    c1 = TreeClass()
                    c2 = TreeClass()
                    c1.dist = 0
                    c2.dist = 0
                    node.add_features(type=TreeClass.SPEC)
                    node.add_child(c1)
                    node.add_child(c2)
                    leaf_nodes.append(c1)
                    leaf_nodes.append(c2)
                    curr_num_leaves += 2
                else:
                    # extinction
                    if curr_num_leaves > 0:
                        died.add(node)
                        node.add_features(type=TreeClass.LOST)
                    else:
                        if not repeat_until_success:
                            raise TotalExtinction(
                                "All lineage went extinct, please retry")
                        # Restart the simulation because the tree has gone
                        # extinct; the new root starts at dist 0.0 exactly
                        # like the initial one.
                        tree = TreeClass()
                        tree.dist = 0.0
                        leaf_nodes = tree.get_leaves()
                        curr_num_leaves = 1
                        died = set()
                        total_time = 0

                # this will always hold true
                assert curr_num_leaves == len(leaf_nodes)

        if removeloss:
            leaves = set(tree.get_leaves()) - died
            tree.prune(leaves)
            tree.delete_single_child_internal(enable_root=True)

        # Name surviving leaves: library names first, then T1, T2, ...
        leaf_compteur = 1
        nlc = 0
        for node in tree.get_leaves():
            if not node.has_feature('type', name=TreeClass.LOST):
                if nlc < len(names_library):
                    node.name = names_library[nlc]
                    nlc += 1
                else:
                    node.name = "T%d" % leaf_compteur
                    leaf_compteur += 1
        return tree
Exemple #22
0
    def pure_birth_tree(self, birth=1.0, **kwargs):
        """Generates a uniform-rate pure-birth process tree.

        You can pass supplemental keyword arguments:
            - ``nsize`` : total number of leaves before stopping
            - ``names_library`` : list of names for the leaves
            - ``max_time`` : maximum time for simulation
            - if ``nsize`` is not provided, the length of ``names_library``
              is used
            - if there are more leaves than names in ``names_library``, the
              remaining leaves are named "T%d" (T1, T2, etc)

        Raises MissingParameterError when no stopping criterion is available
        (no ``nsize``, ``names_library``, ``max_time`` nor ``self.stopcrit``).
        """
        tree = TreeClass()
        tree.dist = 0.0

        done = False
        tname = kwargs.get("names_library", [])
        nsize = kwargs.get("nsize", len(tname))
        max_time = kwargs.get("max_time", None)
        pb_stop = FunctionSlot("Pure birth stopping")
        if nsize:
            pb_stop.add(stop_with_tree_size)
        if max_time:
            pb_stop.add(stop_with_max_time)
        if pb_stop.isEmpty() and self.stopcrit.isEmpty():
            raise MissingParameterError(
                "Either specify a names_library, nsize, max_time or a stopping criterion"
            )

        # Everything except the options consumed here is forwarded to the
        # stopping functions, together with the resolved nsize / max_time.
        extra_param = {}
        for k, v in kwargs.items():
            if k not in ['nsize', 'max_time', 'removeloss']:
                extra_param[k] = v

        extra_param['nsize'] = nsize
        extra_param['max_time'] = max_time

        total_time = 0
        while True:
            # time before new node
            # given the probability of birth
            leaf_nodes = tree.get_leaves()
            # float() keeps the ratio from being truncated to an integer
            # (possibly 0) when ``birth`` is passed as an int.
            # NOTE(review): the rate is len(leaves) / birth as in the
            # original code -- confirm this parameterisation is intended.
            wtime = random.expovariate(len(leaf_nodes) / float(birth))
            total_time += wtime
            for leaf in leaf_nodes:
                leaf.dist += wtime
            if not pb_stop.isEmpty():
                for val in pb_stop.applyFunctions(tree,
                                                  cur_time=total_time,
                                                  cur_size=len(leaf_nodes),
                                                  **extra_param):
                    done = done or val
            if not self.stopcrit.isEmpty():
                for val in self.stopcrit.applyFunctions(
                        tree,
                        cur_time=total_time,
                        cur_size=len(leaf_nodes),
                        **extra_param):
                    done = done or val
            if done:
                break

            if max_time is None or total_time <= max_time:
                # now add new node to a random leaf
                node = random.choice(leaf_nodes)
                c1 = TreeClass()
                c2 = TreeClass()
                node.add_child(c1)
                node.add_child(c2)
                c1.dist = 0.0
                c2.dist = 0.0

        # Name the leaves: library names first, then T1, T2, ...
        leaf_compteur = 1
        for ind, node in enumerate(tree.get_leaves()):
            if ind < len(tname):
                node.name = tname[ind]
            else:
                node.name = "T%d" % leaf_compteur
                leaf_compteur += 1
        return tree
def reconcile(genetree=None, lcaMap=None, lost=False, lost_label_fn=None):
    """Reconcile a gene tree topology with a species tree, in place.

    Every node of ``genetree`` receives a ``type`` feature
    (``TreeClass.SPEC``, ``TreeClass.AD`` or ``TreeClass.NAD``) and a boolean
    ``dup`` feature. When ``lost`` is enabled, extra nodes typed
    ``TreeClass.LOST`` are inserted for inferred gene losses. Finally the
    root is tagged with ``reconciled=True``.

    :argument genetree: gene tree to annotate (modified in place).
    :argument lcaMap: mapping from gene tree nodes to species tree nodes, as
        obtained with lcaMapping. It is updated with the internal nodes
        created while inserting losses.
    :argument lost: ``True`` or the string "yes" (case-insensitive) to insert
        loss nodes.
    :argument lost_label_fn: optional callable used to name loss nodes added
        on branches; it receives the list of lost species when there are
        several, otherwise the single species string. It is not used for the
        losses added in the polytomy step.

    Raises ``Exception`` when ``genetree`` or ``lcaMap`` is ``None``.
    """

    if (lcaMap is None or genetree is None):
        raise Exception("lcaMapping or genetree not found")
    else:
        lost_count = 1
        # First pass: label each node as speciation or duplication.
        for node in genetree.traverse("levelorder"):
            node.add_features(type=TreeClass.SPEC)
            node.add_features(dup=False)

            # print node.name , node.species, " and children name ",
            # node.get_children_name()," and children species ",
            # node.get_children_species()
            # A node mapped to the same species node as one of its first two
            # children is a duplication (only children 0 and 1 are checked).
            if (not node.is_leaf()
                    and (lcaMap[node] == lcaMap[node.get_child_at(0)]
                         or lcaMap[node] == lcaMap[node.get_child_at(1)])):
                node.dup = True
                node.type = TreeClass.AD
                # print "\n\nnode = ", node, "\n\nand children : ",
                # node.children
                # No species shared between the two children: the
                # duplication is relabelled NAD.
                if not (set(node.get_child_at(0).get_species()).intersection(
                        set(node.get_child_at(1).get_species()))):
                    node.type = TreeClass.NAD

        # Second pass (optional): insert loss nodes.
        if (isinstance(lost, basestring)
                and lost.upper() == "YES") or lost == True:
            for node in genetree.traverse("postorder"):
                children_list = node.get_children()
                node_is_dup = (node.type == TreeClass.NAD
                               or node.type == TreeClass.AD)
                for child_c in children_list:
                    # The child is "too deep" in the species tree: below a
                    # duplication it should map to the same species node,
                    # below a speciation to a direct child of that node.
                    if ((node_is_dup and lcaMap[child_c] != lcaMap[node])
                            or (not node_is_dup and
                                (lcaMap[child_c].up != lcaMap[node]))):

                        # Climb the species tree one level per iteration,
                        # wrapping child_c in a new internal node that also
                        # carries a loss leaf for the missing species.
                        while ((lcaMap[child_c].up != lcaMap[node]
                                and node.type == TreeClass.SPEC)
                               or (lcaMap[child_c] != lcaMap[node]
                                   and node.type != TreeClass.SPEC)):
                            lostnode = TreeClass()
                            intern_lost = TreeClass()
                            intern_lost.add_features(type=TreeClass.SPEC)
                            intern_lost.add_features(dup=False)

                            # The new internal node maps to the parent of
                            # the child's species node (or to that node
                            # itself when it is already the root).
                            if lcaMap[child_c].is_root():
                                intern_lost.species = ",".join(
                                    lcaMap[child_c].get_leaf_names())
                                lcaMap.update({intern_lost: lcaMap[child_c]})

                            else:
                                intern_lost.species = ",".join(
                                    lcaMap[child_c].up.get_leaf_names())
                                lcaMap.update(
                                    {intern_lost: lcaMap[child_c].up})

                            # change here to display a subtree and not a leaf
                            # with a lot of specie
                            # Lost species = species under the new internal
                            # node that are not under child_c.
                            lostnode.species = ",".join(
                                set(lcaMap[intern_lost].get_leaf_names()) -
                                set(lcaMap[child_c].get_leaf_names()))
                            splist = lostnode.species.split(',')
                            if (len(splist) > 1):
                                if lost_label_fn:
                                    lostnode.name = lost_label_fn(splist)
                                else:
                                    # default label: running counter plus the
                                    # first three letters of each species
                                    lostnode.name = "lost_" + \
                                        str(lost_count) + "_" + \
                                        "|".join([s[0:3] for s in splist])

                            else:
                                if lost_label_fn:
                                    lostnode.name = lost_label_fn(
                                        lostnode.species)
                                else:
                                    lostnode.name = "lost_" + lostnode.species

                            lostnode.add_features(type=TreeClass.LOST)
                            lostnode.add_features(dup=False)

                            lost_count += 1
                            child_c.detach()
                            # print "***********************\n\n** node : ", node, "\n\n** child_c: ", child_c, "\n\n** child parent", child_c.up
                            # node.remove_child(child_c)
                            intern_lost.add_child(child=lostnode)
                            intern_lost.add_child(child=child_c)
                            child_c = intern_lost
                        # Re-attach the (possibly wrapped) child under node.
                        node.add_child(child_c)
                        # NOTE(review): this appends to the list being
                        # iterated, so the wrapped child is visited again.
                        children_list.append(child_c)

                # Case of polytomy in the species tree: species below
                # lcaMap[node] that none of the children cover are added as
                # one extra loss leaf.
                if not node.is_leaf():
                    specie_list = ",".join([
                        ",".join(lcaMap[child_c].get_leaf_names())
                        for child_c in node.get_children()
                    ])
                    child_specie_set = set(specie_list.split(","))
                    real_specie_list = set(lcaMap[node].get_leaf_names())
                    unadded_specie = real_specie_list - child_specie_set
                    # print unadded_specie, child_specie_set, real_specie_list
                    # print node.species
                    if (unadded_specie):
                        lostnode = TreeClass()
                        lostnode.add_features(type=TreeClass.LOST)
                        lostnode.add_features(dup=False)
                        lostnode.species = ",".join(unadded_specie)

                        if (len(unadded_specie) > 1):
                            lostnode.name = "lost_" + \
                                           str(lost_count) + "_" + \
                                        "|".join([s[0:3] for s in unadded_specie])

                        else:
                            lostnode.name = "lost_" + lostnode.species

                        lost_count += 1
                        node.add_child(lostnode)
    # Flag the tree so callers can tell it has been reconciled.
    genetree.add_features(reconciled=True)
Exemple #24
0
    def pure_birth_tree(self, birth=1.0, **kwargs):
        """Generates a uniform-rate pure-birth process tree.

        You can pass supplemental keyword arguments:
            - ``nsize`` : total number of leaves before stopping
            - ``names_library`` : list of names for the leaves
            - ``max_time`` : maximum time for simulation
            - if ``nsize`` is not provided, the length of ``names_library``
              is used
            - if there are more leaves than names in ``names_library``, the
              remaining leaves are named "T%d" (T1, T2, etc)

        Raises MissingParameterError when no stopping criterion is available
        (no ``nsize``, ``names_library``, ``max_time`` nor ``self.stopcrit``).
        """
        tree = TreeClass()
        tree.dist = 0.0

        done = False
        tname = kwargs.get("names_library", [])
        nsize = kwargs.get("nsize", len(tname))
        max_time = kwargs.get("max_time", None)
        pb_stop = FunctionSlot("Pure birth stopping")
        if nsize:
            pb_stop.add(stop_with_tree_size)
        if max_time:
            pb_stop.add(stop_with_max_time)
        if pb_stop.isEmpty() and self.stopcrit.isEmpty():
            raise MissingParameterError(
                "Either specify a names_library, nsize, max_time or a stopping criterion")

        # Options consumed here are not forwarded as-is; the resolved
        # nsize / max_time values are passed to the stopping functions.
        extra_param = {}
        for k, v in kwargs.items():
            if k not in ['nsize', 'max_time', 'removeloss']:
                extra_param[k] = v

        extra_param['nsize'] = nsize
        extra_param['max_time'] = max_time

        total_time = 0
        while True:
            # time before new node
            # given the probability of birth
            leaf_nodes = tree.get_leaves()
            # float() prevents integer truncation of the rate (possibly down
            # to 0) when ``birth`` is given as an int.
            # NOTE(review): the rate is len(leaves) / birth as in the
            # original code -- confirm this parameterisation is intended.
            wtime = random.expovariate(len(leaf_nodes) / float(birth))
            total_time += wtime
            for leaf in leaf_nodes:
                leaf.dist += wtime
            if not pb_stop.isEmpty():
                for val in pb_stop.applyFunctions(tree, cur_time=total_time, cur_size=len(leaf_nodes), **extra_param):
                    done = done or val
            if not self.stopcrit.isEmpty():
                for val in self.stopcrit.applyFunctions(tree, cur_time=total_time, cur_size=len(leaf_nodes), **extra_param):
                    done = done or val
            if done:
                break

            if max_time is None or total_time <= max_time:
                # now add new node to a random leaf
                node = random.choice(leaf_nodes)
                c1 = TreeClass()
                c2 = TreeClass()
                node.add_child(c1)
                node.add_child(c2)
                c1.dist = 0.0
                c2.dist = 0.0

        # Name the leaves: library names first, then T1, T2, ...
        leaf_compteur = 1
        for ind, node in enumerate(tree.get_leaves()):
            if ind < len(tname):
                node.name = tname[ind]
            else:
                node.name = "T%d" % leaf_compteur
                leaf_compteur += 1
        return tree
Exemple #25
0
        def event_in_time(time, node, spnode):
            """Recursively draw events for gene ``node`` during ``time``.

            ``spnode`` is the species node the lineage currently lives in.
            The first event is drawn from an exponential with rate
            ``event_rate``; if it falls beyond ``time`` the lineage reaches
            the end of the branch untouched. Otherwise the event is a
            duplication, a loss or a transfer, and surviving children are
            simulated over the remaining time.
            """
            # waiting time before the next event (never, if the rate is null)
            next_t = INF if event_rate == 0.0 else random.expovariate(event_rate)

            if next_t > time:
                # no event on branch
                # we can stop
                node.dist = time
                node.add_features(type=INF)
                return

            draw = random.random()
            node.dist = next_t

            if draw < birth * 1.0 / event_rate:
                # birth ==> duplication event: both copies stay in the same
                # species and evolve over the remaining time
                for _ in (0, 1):
                    copy = TreeClass()
                    node.add_child(copy)
                    map_to_spec[copy] = spnode
                    event_in_time(time - next_t, copy, spnode)
                node.add_features(type=TreeClass.AD)
                ecounter['dup'] += 1
                return

            if draw < (birth + death) * 1.0 / event_rate:
                # death happen ==> loss
                node.add_features(type=TreeClass.LOST)
                map_to_spec[node] = spnode
                ecounter['loss'] += 1
                died.add(node)
                return

            # give gene to another species ==> transfer
            receivers = list(
                spnode.get_incomparable_list(timeconsistent=True,
                                             wtime=next_t))
            if not receivers or (
                    ign_suc_trn and node.up and node.up.has_feature(
                        'type', name=TreeClass.TRANSFER)):
                # keep node as it is
                # so speciation
                # NOTE(review): node.dist stays at next_t here although the
                # lineage is treated as surviving -- confirm this is intended.
                node.add_features(type=INF)

                self.debug_msg("Could not perform transfer at node")
                self.debug_msg(node)
                return

            cand_receiver = random.choice(receivers)

            node.add_features(type=TreeClass.TRANSFER)
            ecounter['transfer'] += 1

            # copy kept by the donor species
            kept = TreeClass()
            node.add_child(kept)
            map_to_spec[kept] = spnode
            event_in_time(time - next_t, kept, spnode)

            # copy handed over to the receiving species
            moved = TreeClass()
            node.add_child(moved)
            moved.add_features(transfered=True)
            transfered[moved] = cand_receiver
            map_to_spec[moved] = cand_receiver
            t = cand_receiver.brlen - spnode.brlen + time - next_t
            event_in_time(t, moved, cand_receiver)
Exemple #26
0
def polySolverPreprocessing(genetree, specietree, distance_mat, capitalize=False, gene_sep=None, specie_pos="postfix", nFlagVal=1e305, nFlag=False, smap=None, errorproof=False):
    """Preprocess the gene tree, species tree and distance matrix for polytomysolver.

    :argument genetree: a tree object, or a string handed to ``TreeClass``
        (after ``newickPreprocessing`` when no ``smap`` is given).
    :argument specietree: a tree object, or a string that is run through
        ``newickPreprocessing`` and loaded with ``TreeClass``.
    :argument distance_mat: a string forwarded to ``clu.distMatProcessor``
        (together with ``nFlagVal`` and ``nFlag``), or any other truthy value
        to request that the matrix be derived from the gene tree with
        ``get_distance_from_tree``. A falsy value raises ``ValueError``.
    :argument capitalize, gene_sep, specie_pos: forwarded to
        ``genetree.set_species``.
    :argument smap: optional gene-to-species mapping. Either a dict that is
        passed as is to ``set_species``, or a file name / open file whose
        lines hold two whitespace separated fields: a gene pattern and a
        species name. A bare ``*`` in the pattern is turned into ``.*`` and the
        pattern is matched case-insensitively against the start of each leaf
        name.
    :argument errorproof: when the genes of the matrix and of the tree
        differ, try to repair instead of raising ``ValueError``.

    Returns the tuple ``(genetree, specietree, gene_matrix, node_order)``.

    Raises ``ValueError`` for duplicated genes, a missing distance matrix or
    (without ``errorproof``) mismatching gene sets; ``IndexError`` when a
    surplus gene cannot be removed from the matrix; ``Exception`` when the
    tree holds genes absent from the matrix or species absent from the
    species tree.
    """

    # genetree input
    speciemap = None
    if isinstance(genetree, basestring) and not smap:
        genetree, gene_sep = newickPreprocessing(genetree, gene_sep)
        genetree = TreeClass(genetree)

    elif smap:
        # Load the tree whatever the kind of smap: with a dict smap the
        # tree used to stay a plain string and set_species() failed on it.
        if isinstance(genetree, basestring):
            genetree = TreeClass(genetree)
        if isinstance(smap, dict):
            speciemap = smap
        else:
            regexmap = {}
            speciemap = {}
            with open(smap, 'rU') if isinstance(smap, basestring) else smap as INPUT:
                for line in INPUT:
                    fields = line.split()
                    if not fields:
                        # tolerate blank lines in the mapping file
                        continue
                    g, s = fields
                    if ('*') in g and '.*' not in g:
                        # shell-like wildcard ==> regex wildcard
                        g = g.replace('*', '.*')
                    regexmap[re.compile(g, re.IGNORECASE)] = s

            # when several patterns match a leaf, the last one visited wins
            for leaf in genetree:
                for pattern, specie in regexmap.items():
                    if pattern.match(leaf.name):
                        speciemap[leaf.name] = specie

    genetree.set_species(
        speciesMap=speciemap, sep=gene_sep, capitalize=capitalize, pos=specie_pos)

    # genetree check: every gene should appear only once
    leaf_names = genetree.get_leaf_names()
    if len(genetree) != len(set(leaf_names)):
        seen = set()
        duplicates = set()
        for name in leaf_names:
            if name in seen:
                duplicates.add(name)
            else:
                seen.add(name)
        raise ValueError(
            "Your polytomy contains the following gene multiple times : %s" % ", ".join(duplicates))

    # specietree input
    if isinstance(specietree, basestring):
        specietree, _ = newickPreprocessing(specietree, '')
        specietree = TreeClass(specietree)
    specietree.label_internal_node()

    # distance matrix input
    if not distance_mat:
        # This is for debug, will never happen
        raise ValueError(
            "distance matrix not provided and could not be infered from tree")

    if isinstance(distance_mat, basestring):
        gene_matrix, node_order = clu.distMatProcessor(
            distance_mat, nFlagVal, nFlag)
    else:
        # distance mat is provided as a boolean
        # in that case, just try to get it from the genetree
        gene_matrix, node_order = get_distance_from_tree(genetree)

    # genes of the matrix and genes of the tree should be the same
    matrix_genes = set(node_order)
    tree_genes = set(genetree.get_leaf_names())
    listerr = matrix_genes.symmetric_difference(tree_genes)
    if listerr:
        if not errorproof:
            raise ValueError(
                "Different genes in distance matrix and genetree\n : See symmetric difference : %s\n" % ", ".join(listerr))
        if gene_sep:
            resetNodeName(genetree, gene_sep, specie_pos == 'postfix')
        else:
            only_in_matrix = matrix_genes - tree_genes
            only_in_tree = tree_genes - matrix_genes
            if only_in_tree:
                raise Exception(
                    'Genes in trees and not in matrix : %s' % (only_in_tree))
            elif only_in_matrix:
                print("Genes in matrix and not in tree : %s \nAttempt to correct distance matrix" % (
                    ", ".join(only_in_matrix)))
                for gene in only_in_matrix:
                    try:
                        lpos = node_order.index(gene)
                        gene_matrix = clu.remove_ij(
                            gene_matrix, lpos, lpos)
                        del node_order[lpos]
                    except Exception:
                        raise IndexError(
                            "Could not remove gene %s from distance matrix" % gene)

    # Find list of species in genetree but not in specietree
    specieGeneList = set(genetree.get_leaf_species())
    specieList = set([x.name for x in specietree.get_leaves()])
    missing_species = specieGeneList - specieList
    if missing_species:
        if len(specieGeneList.intersection(specieList)) == 0 and gene_sep:
            raise Exception(
                "*** You probably didn't set the correct species position for you input tree !!")
        raise Exception("Species in genetree but not in specietree : %s" % (
            ", ".join(missing_species)))

    return genetree, specietree, gene_matrix, node_order
Exemple #27
0
    def sample_from_tree(self, sptree, birth, death, gain, **kwargs):
        """Sample a tree within another tree using the rate specified
            Note that a tree with all leaves being extinct can be returned by this function
            Use dlt_tree_from_sptree if you want to prevent this.

            ``birth``, ``death`` and ``gain`` are forwarded unchanged to
            ``self.sample_event_on_branches`` for every branch of ``sptree``.

            Supported keyword arguments:
            - ``removeloss`` (default True): delete the lost nodes from the
              returned tree and from the returned histories
            - ``disallow_suc_trn`` (default True): forwarded as ``ign_suc_trn``
              to ``sample_event_on_branches``
            - ``names_library``: either a list of leaf names, reused in order
              (once the list is exhausted the names are recycled, each one
              followed by "_name" once per completed round), or a callable
              ``f(species_name, count)`` returning the leaf name. The default
              produces "<species>_<count>".

            Returns the tuple ``(gtree, recon, events, true_event_counter,
            transfers)`` where ``recon`` maps gene nodes to species nodes,
            ``events`` maps gene nodes to one of "spec", "leaf", "loss", "dup"
            or "transfer", and ``transfers`` gathers what
            ``sample_event_on_branches`` reported as transferred.

            Raises TotalExtinction when the resulting tree has one leaf or less.
        """

        # initialize gene tree
        sptree.compute_branches_length()
        sptree.label_internal_node()
        removeloss = kwargs.get("removeloss", True)
        disallow_suc_trn = kwargs.get("disallow_suc_trn", True)
        leave_names = kwargs.get("names_library", [])

        gtree = TreeClass()
        recon = {gtree: sptree}
        events = {gtree: "spec"}
        losses = set()
        transfers = {}
        true_event_counter = ddict(int)
        snode_counter = ddict(int)
        if not leave_names:
            leave_names = lambda sp, x: sp + "_" + str(x)
        # one-element list so that the nested function can update the
        # counter (there is no ``nonlocal`` in python 2)
        name_counter = [0]

        def create_history(snode, gnode):
            if snode.is_leaf():
                if isinstance(leave_names, list):
                    # number of complete rounds already done over the
                    # library and position inside the current round
                    n_encounter, pos = divmod(
                        name_counter[0], len(leave_names))
                    name_counter[0] += 1
                    gnode.name = leave_names[pos]
                    gnode.name += ("_" + gnode.name) * n_encounter
                else:
                    snode_counter[snode.name] += 1
                    gnode.name = leave_names(
                        snode.name, snode_counter[snode.name])
                events[gnode] = "leaf"
                gnode.add_features(type=TreeClass.SPEC)
            else:
                for schild in snode.get_children():
                    # get branches event for branch (snode, schild)
                    # during time = schild.dist
                    recnode, died, transfered, smap = self.sample_event_on_branches(schild.dist,
                                                                                    schild, birth, death, gain, keeplosses=(not removeloss), ign_suc_trn=disallow_suc_trn, ecounter=true_event_counter)
                    gnode.add_child(recnode)
                    # update list of losses
                    losses.update(died)
                    transfers.update(transfered)
                    recon.update(smap)
                    next_cand = []
                    # then record reconciliation that happened
                    for node in recnode.traverse():
                        node.add_features(species=recon[node].name)
                        if node.type == TreeClass.LOST:
                            events[node] = "loss"
                            # died at the start of coalescence
                            losses.add(node)
                        elif node.is_leaf():
                            node.add_features(type=TreeClass.SPEC)
                            events[node] = "spec"
                            next_cand.append(node)
                        elif node.type == TreeClass.AD:
                            events[node] = "dup"
                        else:
                            events[node] = "transfer"

                    # surviving leaves of this branch carry on in the
                    # species they are mapped to
                    for new_node in next_cand:
                        create_history(recon[new_node], new_node)

                # if no child for node then it is a loss
                if gnode.is_leaf():
                    losses.add(gnode)

        create_history(sptree, gtree)

        gtree.delete_single_child_internal(enable_root=True)
        if removeloss:
            for node in gtree.traverse():
                if node in losses:
                    node.delete()
            gtree.delete_single_child_internal(enable_root=True)
            # forget every node that is not part of the final tree
            remove_from_history = set(recon) - set(gtree.traverse())
            for node in remove_from_history:
                del recon[node]
                events.pop(node, None)

        if len(gtree) <= 1:
            raise TotalExtinction("All taxa are extinct.")
        return gtree, recon, events, true_event_counter, transfers
Exemple #28
0
    def birth_death_tree(self, birth, death, **kwargs):
        """
        Returns a birth-death tree with birth rate specified by ``birth``, and
        death rate specified by ``death``, and  edge lengths in continuous (real)
        units.

        You can pass supplemental arguments:
        - ``nsize`` : total number of leaves before stopping
        - ``names_library`` : list of names for the leaves
        - ``max_time`` : maximum time for simulation
        - if ``nsize`` is not provided, the length of ``names_library`` will be used
        - if there are more leaves than names in ``names_library``, the remaining
        leaves are named in the following format : "T%d" (T1, T2, etc)
        - if ``max_time`` is given as a keyword argument, tree is grown for
        a maximum of ``max_time``.
        - if ``removeloss`` is given as argument (default True), extinct taxa are removed

        The remaining keyword arguments are forwarded, together with ``nsize``
        and ``max_time``, to the stopping functions.

        Under some conditions, it is possible for all lineages on a tree to go extinct.
        In this case, if the keyword argument ``repeat_until_success`` is True (default),
        then a new branching process is initiated. Otherwise a TotalExtinction error is raised.

        A MissingParameterError is raised when neither ``nsize``,
        ``names_library`` nor ``max_time`` is given and ``self.stopcrit`` is empty.
        """

        done = False
        removeloss = kwargs.get("removeloss", True)
        repeat_until_success = kwargs.get("repeat_until_success", True)
        names_library = kwargs.get("names_library", [])
        nsize = kwargs.get("nsize", len(names_library))
        max_time = kwargs.get("max_time", None)
        pb_stop = FunctionSlot("birth death stopping")
        if nsize:
            pb_stop.add(stop_with_tree_size)
        if max_time:
            pb_stop.add(stop_with_max_time)
        if pb_stop.isEmpty() and self.stopcrit.isEmpty():
            raise MissingParameterError(
                "Either specify a names_library, nsize, max_time or a stopping criterion")

        # parameters handed to the stopping functions
        extra_param = {}
        for k, v in kwargs.items():
            if k not in ('nsize', 'max_time', 'removeloss'):
                extra_param[k] = v
        extra_param['nsize'] = nsize
        extra_param['max_time'] = max_time

        # initialize tree
        tree = TreeClass()
        tree.dist = 0.0
        leaf_nodes = tree.get_leaves()
        curr_num_leaves = len(leaf_nodes)

        total_time = 0

        died = set()
        event_rate = float(birth + death)

        while True:
            # waiting time based on event_rate
            wtime = random.expovariate(event_rate)

            total_time += wtime
            for leaf in leaf_nodes:
                # extinct leaves cannot update their branches length
                # (extinction is recorded in the 'type' feature)
                if not leaf.has_feature('type', name=TreeClass.LOST):
                    leaf.dist += wtime

            if not pb_stop.isEmpty():
                for val in pb_stop.applyFunctions(tree, cur_time=total_time, cur_size=curr_num_leaves, **extra_param):
                    done = done or val
            if not self.stopcrit.isEmpty():
                for val in self.stopcrit.applyFunctions(tree, cur_time=total_time, cur_size=curr_num_leaves, **extra_param):
                    done = done or val
            if done:
                break
            # if event occurs within time constraints
            if max_time is None or total_time <= max_time:

                # select node at random, then find chance it died or give birth
                # (speciation)
                node = random.choice(leaf_nodes)
                eprob = random.random()
                leaf_nodes.remove(node)
                curr_num_leaves -= 1
                if eprob < birth / event_rate:
                    # speciation: the lineage is replaced by two new ones
                    c1 = TreeClass()
                    c2 = TreeClass()
                    c1.dist = 0
                    c2.dist = 0
                    node.add_features(type=TreeClass.SPEC)
                    node.add_child(c1)
                    node.add_child(c2)
                    leaf_nodes.append(c1)
                    leaf_nodes.append(c2)
                    curr_num_leaves += 2
                elif curr_num_leaves > 0:
                    # extinction, other lineages are still alive
                    died.add(node)
                    node.add_features(type=TreeClass.LOST)
                else:
                    # extinction of the last lineage
                    if not repeat_until_success:
                        raise TotalExtinction(
                            "All lineage went extinct, please retry")
                    # Restart the simulation because the tree has gone
                    # extinct; the new root starts with a null branch
                    # length, exactly like the first one
                    tree = TreeClass()
                    tree.dist = 0.0
                    leaf_nodes = tree.get_leaves()
                    curr_num_leaves = len(leaf_nodes)
                    died = set()
                    total_time = 0

                # this will always hold true
                assert curr_num_leaves == len(leaf_nodes)

        if removeloss:
            leaves = set(tree.get_leaves()) - died
            tree.prune(leaves)
            tree.delete_single_child_internal(enable_root=True)

        # name the surviving leaves: library names first, then T1, T2, ...
        leaf_compteur = 1
        nlc = 0
        for node in tree.get_leaves():
            if not node.has_feature('type', name=TreeClass.LOST):
                if nlc < len(names_library):
                    node.name = names_library[nlc]
                    nlc += 1
                else:
                    node.name = "T%d" % leaf_compteur
                    leaf_compteur += 1
        return tree