def pure_kingman(taxon_set, pop_size=1, rng=None):
    """
    Builds a tree over `taxon_set` by running the coalescent, with no time
    limit (`period=None`), on one freshly created node per taxon.

    `taxon_set`
        Iterable of taxon objects. Each one becomes the taxon of a starting
        node, and the collection itself is handed to the resulting tree.

    `pop_size`
        Population size forwarded to `coalescent.coalesce()`. Defaults to 1.

    `rng`
        Random number generator. `GLOBAL_RNG` is used when this is None.

    Returns a `dataobject.Tree` whose seed node is the first node returned
    by the coalescent run.
    """
    # NOTE(review): `use_expected_tmrca=True` is passed below; presumably this
    # replaces random waiting times with their expectations -- confirm that
    # this is intended for a function named `pure_kingman`.
    generator = GLOBAL_RNG if rng is None else rng

    lineages = []
    for taxon in taxon_set:
        lineages.append(dataobject.Node(taxon=taxon))

    survivors = coalescent.coalesce(
        nodes=lineages,
        pop_size=pop_size,
        period=None,
        rng=generator,
        use_expected_tmrca=True,
    )
    return dataobject.Tree(taxon_set=taxon_set, seed_node=survivors[0])
def mean_kingman(taxon_set, pop_size=1, rng=None):
    """
    Returns a tree with coalescent intervals given by the expected times under
    Kingman's neutral coalescent.

    `taxon_set`
        Iterable of taxon objects. One node is created per taxon, and the
        collection is also passed to the returned tree.

    `pop_size`
        Population size forwarded to `coalescent.coalesce()`. Defaults to 1.

    `rng`
        Random number generator forwarded to `coalescent.coalesce()`.
        `GLOBAL_RNG` is used when this is None.

    Returns a `dataobject.Tree` whose seed node is the first node returned
    by the coalescent run.
    """
    # `rng` used to be referenced here without being a parameter, which made
    # every call fail with UnboundLocalError. It is now an optional trailing
    # argument, so existing two-argument callers keep working.
    if rng is None:
        rng = GLOBAL_RNG  # use the global rng by default

    nodes = [dataobject.Node(taxon=t) for t in taxon_set]
    seed_node = coalescent.coalesce(
        nodes=nodes,
        pop_size=pop_size,
        period=None,
        rng=rng,
        use_expected_tmrca=True,
    )[0]
    return dataobject.Tree(taxon_set=taxon_set, seed_node=seed_node)
def pure_kingman(taxon_set, pop_size=1, rng=None):
    """
    Generates a tree under the unconstrained Kingman's coalescent process.

    `taxon_set`
        Iterable of taxon objects. Each one becomes the taxon of a starting
        node, and the collection itself is handed to the resulting tree.

    `pop_size`
        Population size forwarded to `coalescent.coalesce()`. Defaults to 1.

    `rng`
        Random number generator. `GLOBAL_RNG` is used when this is None.

    Returns a `dataobject.Tree` whose seed node is the first node returned
    by the coalescent run (which is called with `period=None`).
    """
    generator = GLOBAL_RNG if rng is None else rng

    lineages = []
    for taxon in taxon_set:
        lineages.append(dataobject.Node(taxon=taxon))

    survivors = coalescent.coalesce(
        nodes=lineages,
        pop_size=pop_size,
        period=None,
        rng=generator,
    )
    return dataobject.Tree(taxon_set=taxon_set, seed_node=survivors[0])
def constrained_kingman(
    pop_tree,
    gene_tree_list=None,
    rng=None,
    gene_node_label_func=None,
    num_genes_attr="num_genes",
    pop_size_attr="pop_size",
    decorate_original_tree=False,
):
    """
    Given a population tree, `pop_tree`, this will return a *pair of trees*: a
    gene tree simulated on this population tree based on Kingman's
    n-coalescent, and a population tree with the additional attribute
    `gene_nodes` on each node, which is a list of uncoalesced nodes from the
    gene tree associated with the given node from the population tree.

    `pop_tree` should be a DendroPy Tree object (or an object of a class
    derived from it) whose leaf nodes carry the attribute named by
    `num_genes_attr` (default `num_genes`): the number of gene samples from
    each population in the present.

    `pop_size_attr` is the name of the attribute on the edges of `pop_tree`
    that specifies the population size. By default it is `pop_size`. It should
    specify the effective *haploid* population size; i.e., number of genes in
    the population: 2 * N in a diploid population of N individuals, or N in a
    haploid population of N individuals. If `pop_size` is 1 or 0 or None, then
    the edge lengths of `pop_tree` are taken to be in haploid population
    units; i.e. where 1 unit equals 2N generations for a diploid population of
    size N, or N generations for a haploid population of size N. Otherwise the
    edge lengths of `pop_tree` are taken to be in generations. A non-root edge
    without the attribute is simulated with a population size of 1.

    If `gene_tree_list` is given, then the gene tree is appended to it, and
    its `taxon_set` is used to manage the gene tree's taxa.

    `gene_node_label_func` is a function that takes two arguments (a string
    and an integer, respectively, where the string is the containing species
    taxon label and the integer is the 1-based gene index) and returns a
    label for the corresponding gene node. Defaults to "<label>_<NN>".

    If `decorate_original_tree` is True, then the list of uncoalesced nodes at
    each node of the population tree is added to the original (input)
    population tree instead of a copy.

    `rng` is the random number generator; `GLOBAL_RNG` is used when None.

    Returns the tuple `(gene_tree, working_poptree)`.

    Note that this function does very much the same thing as
    `contained_coalescent()`, but provides a very different API.
    """
    if rng is None:
        rng = GLOBAL_RNG  # use the global rng by default

    if gene_tree_list is not None:
        gtaxa = gene_tree_list.taxon_set
    else:
        gtaxa = dataobject.TaxonSet()

    if gene_node_label_func is None:
        gene_node_label_func = lambda x, y: "%s_%02d" % (x, y)

    # We create a set of gene nodes for each leaf node on the population
    # tree, and associate those gene nodes with the leaf through the
    # `gene_nodes` attribute. Note that this decorates the leaves of the
    # *input* tree, before any copy is taken.
    for leaf in pop_tree.leaf_iter():
        gene_nodes = []
        for gene_count in range(getattr(leaf, num_genes_attr)):
            gene_node = dataobject.Node()
            gene_node.taxon = gtaxa.require_taxon(
                label=gene_node_label_func(leaf.taxon.label, gene_count + 1)
            )
            gene_nodes.append(gene_node)
        leaf.gene_nodes = gene_nodes

    if decorate_original_tree:
        working_poptree = pop_tree
    else:
        # Work on a copy built by the Tree constructor so that the
        # simulation does not further modify the caller's tree.
        working_poptree = dataobject.Tree(pop_tree)

    # start with a new tree
    gene_tree = dataobject.Tree()
    gene_tree.taxon_set = gtaxa

    # We iterate through the edges of the population tree in post-order,
    # i.e., visiting child edges before we visit parent edges. For each edge
    # visited, we take the genes found at its head node and run the
    # coalescent simulation on them, bounded by the length of the edge. Any
    # genes that have not yet coalesced at the end of this period are added
    # to the genes of the tail (parent) node of the edge.
    #
    # `pop_size` is initialised up front so that a tree consisting of a
    # single (root) node no longer fails with UnboundLocalError.
    pop_size = 1
    for edge in working_poptree.postorder_edge_iter():
        head_node = edge.head_node
        if head_node.parent_node is None:
            # Root: run the unconstrained coalescent. The root edge's own
            # population size is honoured when present; previously it was
            # ignored in favour of whatever the last child edge had set.
            # When the root edge lacks the attribute, the most recently used
            # value is kept, as before.
            if hasattr(edge, pop_size_attr):
                pop_size = getattr(edge, pop_size_attr)
            if len(head_node.gene_nodes) > 1:
                final = coalescent.coalesce(
                    nodes=head_node.gene_nodes,
                    pop_size=pop_size,
                    period=None,
                    rng=rng,
                )
            else:
                final = head_node.gene_nodes
            gene_tree.seed_node = final[0]
        else:
            if hasattr(edge, pop_size_attr):
                pop_size = getattr(edge, pop_size_attr)
            else:
                # this means all our time will be in population units
                pop_size = 1
            uncoal = coalescent.coalesce(
                nodes=head_node.gene_nodes,
                pop_size=pop_size,
                period=edge.length,
                rng=rng,
            )
            tail_node = edge.tail_node
            if not hasattr(tail_node, "gene_nodes"):
                tail_node.gene_nodes = []
            tail_node.gene_nodes.extend(uncoal)

    gene_tree.is_rooted = True
    if gene_tree_list is not None:
        gene_tree_list.append(gene_tree)
    return gene_tree, working_poptree
def contained_coalescent(
    containing_tree, gene_to_containing_taxon_map, edge_pop_size_attr="pop_size", default_pop_size=1, rng=None
):
    """
    Returns a gene tree simulated under the coalescent contained within a
    population or species tree.

    `containing_tree`
        The population or species tree. If `edge_pop_size_attr` is not None,
        and population sizes given are non-trivial (i.e., >1), then edge
        lengths on this tree are in units of generations. Otherwise edge
        lengths are in population units; i.e. 2N generations for diploid
        populations of size N, or N generations for haploid populations of
        size N.

    `gene_to_containing_taxon_map`
        A TaxonSetMapping object relating the gene Taxon objects to the Taxon
        objects of `containing_tree`. Its `domain_taxon_set` is used as the
        gene tree's taxon set, and its `reverse` attribute is looked up by
        containing-tree taxon to obtain the gene taxa sampled from it.

    `edge_pop_size_attr`
        Name of attribute of edges that specify population size. By default
        this is "pop_size". If this attribute does not exist,
        `default_pop_size` will be used. The value for this attribute
        should be the haploid population size or the number of genes;
        i.e. 2N for a diploid population of N individuals, or N for a
        haploid population of N individuals. This value determines how
        branch length units are interpreted in the input tree,
        `containing_tree`. If a biologically-meaningful value, then branch
        lengths on the `containing_tree` are properly read as generations.
        If not (e.g. 1 or 0), then they are in population units, i.e. where
        1 unit of time equals 2N generations for a diploid population of
        size N, or N generations for a haploid population of size N.
        If this argument is None, then population sizes default to
        `default_pop_size`.

    `default_pop_size`
        Population size to use if `edge_pop_size_attr` is None or
        if an edge does not have the attribute. Defaults to 1.

    `rng`
        Random number generator; `GLOBAL_RNG` is used when None.

    The returned gene tree will have the following extra attributes:

        `pop_node_genes`
            A dictionary with nodes of `containing_tree` as keys and a list
            of gene tree nodes that are uncoalesced as values.

    Note that this function does very much the same thing as
    `constrained_kingman()`, but provides a very different API.
    """
    if rng is None:
        rng = GLOBAL_RNG

    # containing-tree taxon -> gene taxa sampled from that population
    pop_gene_taxa = gene_to_containing_taxon_map.reverse

    gene_tree_taxon_set = gene_to_containing_taxon_map.domain_taxon_set
    if gene_tree_taxon_set is None:
        # This fallback previously iterated an undefined name
        # (`pop_gene_taxa_map`) and therefore always raised NameError.
        # Presumably `reverse` is dict-like (it is already used with `in`
        # and `[]` below) -- TODO confirm `.values()` is available.
        gene_tree_taxon_set = dataobject.TaxonSet()
        for gene_taxa in pop_gene_taxa.values():
            for taxon in gene_taxa:
                gene_tree_taxon_set.add(taxon)

    gene_tree = dataobject.Tree(taxon_set=gene_tree_taxon_set)
    gene_tree.is_rooted = True

    # Seed every containing-tree node that has sampled genes with one fresh
    # gene node per gene taxon.
    pop_node_genes = {}
    for nd in containing_tree.postorder_node_iter():
        if nd.taxon and nd.taxon in pop_gene_taxa:
            pop_node_genes[nd] = []
            for gene_taxon in pop_gene_taxa[nd.taxon]:
                gene_node = dataobject.Node()
                gene_node.taxon = gene_taxon
                pop_node_genes[nd].append(gene_node)

    # Post-order: child edges are processed before their parents, so the
    # lineages surviving an edge can be handed up to its tail node.
    for edge in containing_tree.postorder_edge_iter():
        if edge_pop_size_attr and hasattr(edge, edge_pop_size_attr):
            pop_size = getattr(edge, edge_pop_size_attr)
        else:
            pop_size = default_pop_size

        if edge.head_node.parent_node is None:
            # Root: coalesce without a time limit.
            # NOTE(review): `default_pop_size` is used here even when the
            # root edge carries its own population size -- confirm whether
            # that is intentional.
            if len(pop_node_genes[edge.head_node]) > 1:
                final = coalescent.coalesce(
                    nodes=pop_node_genes[edge.head_node],
                    pop_size=default_pop_size,
                    period=None,
                    rng=rng,
                )
            else:
                final = pop_node_genes[edge.head_node]
            gene_tree.seed_node = final[0]
        else:
            uncoal = coalescent.coalesce(
                nodes=pop_node_genes[edge.head_node],
                pop_size=pop_size,
                period=edge.length,
                rng=rng,
            )
            if edge.tail_node not in pop_node_genes:
                pop_node_genes[edge.tail_node] = []
            pop_node_genes[edge.tail_node].extend(uncoal)

    gene_tree.pop_node_genes = pop_node_genes
    return gene_tree
def constrained_kingman(pop_tree,
                        gene_tree_list=None,
                        rng=None,
                        gene_node_label_func=None,
                        num_genes_attr='num_genes',
                        pop_size_attr='pop_size',
                        decorate_original_tree=False):
    """
    Given a population tree, `pop_tree`, this will return a *pair of trees*: a
    gene tree simulated on this population tree based on Kingman's
    n-coalescent, and a population tree with the additional attribute
    `gene_nodes` on each node, which is a list of uncoalesced nodes from the
    gene tree associated with the given node from the population tree.

    `pop_tree` should be a DendroPy Tree object (or an object of a class
    derived from it) whose leaf nodes carry the attribute named by
    `num_genes_attr` (default `num_genes`): the number of gene samples from
    each population in the present.

    `pop_size_attr` is the name of the attribute on the edges of `pop_tree`
    that specifies the population size. By default it is `pop_size`. It should
    specify the effective *haploid* population size; i.e., number of genes in
    the population: 2 * N in a diploid population of N individuals, or N in a
    haploid population of N individuals. If `pop_size` is 1 or 0 or None, then
    the edge lengths of `pop_tree` are taken to be in haploid population
    units; i.e. where 1 unit equals 2N generations for a diploid population of
    size N, or N generations for a haploid population of size N. Otherwise the
    edge lengths of `pop_tree` are taken to be in generations. A non-root edge
    without the attribute is simulated with a population size of 1.

    If `gene_tree_list` is given, then the gene tree is appended to it, and
    its `taxon_set` is used to manage the gene tree's taxa.

    `gene_node_label_func` is a function that takes two arguments (a string
    and an integer, respectively, where the string is the containing species
    taxon label and the integer is the 1-based gene index) and returns a
    label for the corresponding gene node. Defaults to "<label>_<NN>".

    If `decorate_original_tree` is True, then the list of uncoalesced nodes at
    each node of the population tree is added to the original (input)
    population tree instead of a copy.

    `rng` is the random number generator; `GLOBAL_RNG` is used when None.

    Returns the tuple `(gene_tree, working_poptree)`.

    Note that this function does very much the same thing as
    `contained_coalescent()`, but provides a very different API.
    """
    if rng is None:
        rng = GLOBAL_RNG  # use the global rng by default

    if gene_tree_list is not None:
        gtaxa = gene_tree_list.taxon_set
    else:
        gtaxa = dataobject.TaxonSet()

    if gene_node_label_func is None:
        gene_node_label_func = lambda x, y: "%s_%02d" % (x, y)

    # We create a set of gene nodes for each leaf node on the population
    # tree, and associate those gene nodes with the leaf through the
    # `gene_nodes` attribute. Note that this decorates the leaves of the
    # *input* tree, before any copy is taken.
    for leaf in pop_tree.leaf_iter():
        gene_nodes = []
        for gene_count in range(getattr(leaf, num_genes_attr)):
            gene_node = dataobject.Node()
            gene_node.taxon = gtaxa.require_taxon(
                label=gene_node_label_func(leaf.taxon.label, gene_count + 1))
            gene_nodes.append(gene_node)
        leaf.gene_nodes = gene_nodes

    if decorate_original_tree:
        working_poptree = pop_tree
    else:
        # start with a new (deep) copy of the population tree so as not to
        # change the original tree any further
        working_poptree = copy.deepcopy(pop_tree)

    # start with a new tree
    gene_tree = dataobject.Tree()
    gene_tree.taxon_set = gtaxa

    # We iterate through the edges of the population tree in post-order,
    # i.e., visiting child edges before we visit parent edges. For each edge
    # visited, we take the genes found at its head node and run the
    # coalescent simulation on them, bounded by the length of the edge. Any
    # genes that have not yet coalesced at the end of this period are added
    # to the genes of the tail (parent) node of the edge.
    #
    # `pop_size` is initialised up front so that a tree consisting of a
    # single (root) node no longer fails with UnboundLocalError.
    pop_size = 1
    for edge in working_poptree.postorder_edge_iter():
        head_node = edge.head_node
        if head_node.parent_node is None:
            # Root: run the unconstrained coalescent. The root edge's own
            # population size is honoured when present; previously it was
            # ignored in favour of whatever the last child edge had set.
            # When the root edge lacks the attribute, the most recently used
            # value is kept, as before.
            if hasattr(edge, pop_size_attr):
                pop_size = getattr(edge, pop_size_attr)
            if len(head_node.gene_nodes) > 1:
                final = coalescent.coalesce(nodes=head_node.gene_nodes,
                                            pop_size=pop_size,
                                            period=None,
                                            rng=rng)
            else:
                final = head_node.gene_nodes
            gene_tree.seed_node = final[0]
        else:
            if hasattr(edge, pop_size_attr):
                pop_size = getattr(edge, pop_size_attr)
            else:
                # this means all our time will be in population units
                pop_size = 1
            uncoal = coalescent.coalesce(nodes=head_node.gene_nodes,
                                         pop_size=pop_size,
                                         period=edge.length,
                                         rng=rng)
            tail_node = edge.tail_node
            if not hasattr(tail_node, 'gene_nodes'):
                tail_node.gene_nodes = []
            tail_node.gene_nodes.extend(uncoal)

    gene_tree.is_rooted = True
    if gene_tree_list is not None:
        gene_tree_list.append(gene_tree)
    return gene_tree, working_poptree
def contained_coalescent(containing_tree,
                         gene_to_containing_taxon_map,
                         edge_pop_size_attr="pop_size",
                         default_pop_size=1,
                         rng=None):
    """
    Returns a gene tree simulated under the coalescent contained within a
    population or species tree.

    `containing_tree`
        The population or species tree. If `edge_pop_size_attr` is not None,
        and population sizes given are non-trivial (i.e., >1), then edge
        lengths on this tree are in units of generations. Otherwise edge
        lengths are in population units; i.e. 2N generations for diploid
        populations of size N, or N generations for haploid populations of
        size N.

    `gene_to_containing_taxon_map`
        A TaxonSetMapping object relating the gene Taxon objects to the Taxon
        objects of `containing_tree`. Its `domain_taxon_set` is used as the
        gene tree's taxon set, and its `reverse` attribute is looked up by
        containing-tree taxon to obtain the gene taxa sampled from it.

    `edge_pop_size_attr`
        Name of attribute of edges that specify population size. By default
        this is "pop_size". If this attribute does not exist,
        `default_pop_size` will be used. The value for this attribute
        should be the haploid population size or the number of genes;
        i.e. 2N for a diploid population of N individuals, or N for a
        haploid population of N individuals. This value determines how
        branch length units are interpreted in the input tree,
        `containing_tree`. If a biologically-meaningful value, then branch
        lengths on the `containing_tree` are properly read as generations.
        If not (e.g. 1 or 0), then they are in population units, i.e. where
        1 unit of time equals 2N generations for a diploid population of
        size N, or N generations for a haploid population of size N.
        If this argument is None, then population sizes default to
        `default_pop_size`.

    `default_pop_size`
        Population size to use if `edge_pop_size_attr` is None or
        if an edge does not have the attribute. Defaults to 1.

    `rng`
        Random number generator; `GLOBAL_RNG` is used when None.

    The returned gene tree will have the following extra attributes:

        `pop_node_genes`
            A dictionary with nodes of `containing_tree` as keys and a list
            of gene tree nodes that are uncoalesced as values.

    Note that this function does very much the same thing as
    `constrained_kingman()`, but provides a very different API.
    """
    if rng is None:
        rng = GLOBAL_RNG

    # containing-tree taxon -> gene taxa sampled from that population
    pop_gene_taxa = gene_to_containing_taxon_map.reverse

    gene_tree_taxon_set = gene_to_containing_taxon_map.domain_taxon_set
    if gene_tree_taxon_set is None:
        # This fallback previously iterated an undefined name
        # (`pop_gene_taxa_map`) and therefore always raised NameError.
        # Presumably `reverse` is dict-like (it is already used with `in`
        # and `[]` below) -- TODO confirm `.values()` is available.
        gene_tree_taxon_set = dataobject.TaxonSet()
        for gene_taxa in pop_gene_taxa.values():
            for taxon in gene_taxa:
                gene_tree_taxon_set.add(taxon)

    gene_tree = dataobject.Tree(taxon_set=gene_tree_taxon_set)
    gene_tree.is_rooted = True

    # Seed every containing-tree node that has sampled genes with one fresh
    # gene node per gene taxon.
    pop_node_genes = {}
    for nd in containing_tree.postorder_node_iter():
        if nd.taxon and nd.taxon in pop_gene_taxa:
            pop_node_genes[nd] = []
            for gene_taxon in pop_gene_taxa[nd.taxon]:
                gene_node = dataobject.Node()
                gene_node.taxon = gene_taxon
                pop_node_genes[nd].append(gene_node)

    # Post-order: child edges are processed before their parents, so the
    # lineages surviving an edge can be handed up to its tail node.
    for edge in containing_tree.postorder_edge_iter():
        if edge_pop_size_attr and hasattr(edge, edge_pop_size_attr):
            pop_size = getattr(edge, edge_pop_size_attr)
        else:
            pop_size = default_pop_size

        if edge.head_node.parent_node is None:
            # Root: coalesce without a time limit.
            # NOTE(review): `default_pop_size` is used here even when the
            # root edge carries its own population size -- confirm whether
            # that is intentional.
            if len(pop_node_genes[edge.head_node]) > 1:
                final = coalescent.coalesce(
                    nodes=pop_node_genes[edge.head_node],
                    pop_size=default_pop_size,
                    period=None,
                    rng=rng)
            else:
                final = pop_node_genes[edge.head_node]
            gene_tree.seed_node = final[0]
        else:
            uncoal = coalescent.coalesce(nodes=pop_node_genes[edge.head_node],
                                         pop_size=pop_size,
                                         period=edge.length,
                                         rng=rng)
            if edge.tail_node not in pop_node_genes:
                pop_node_genes[edge.tail_node] = []
            pop_node_genes[edge.tail_node].extend(uncoal)

    gene_tree.pop_node_genes = pop_node_genes
    return gene_tree
def simulate_contained_kingman(self,
                               edge_pop_size_attr='pop_size',
                               default_pop_size=1,
                               label=None,
                               rng=None):
    """
    Simulates and returns a "censored" (Kingman) neutral coalescence tree
    conditional on self.

        ``edge_pop_size_attr``
            Name of the attribute on self's edges that gives the population
            size. Edges lacking it use ``default_pop_size``.

        ``default_pop_size``
            Population size for edges without ``edge_pop_size_attr``.
            Defaults to 1.

        ``label``
            Label passed to the constructor of the returned tree.

        ``rng``
            Random number generator, handed unchanged (including ``None``)
            to ``coalescent.coalesce()``.

    Each leaf's ``edge.embedded_taxa`` supplies the gene taxa placed at that
    leaf, so all edge-associated taxon sets must be up-to-date (otherwise,
    ``build_edge_taxa_sets()`` should be called). The tree is *not* added to
    the set of embedded trees; for that, call ``embed_contained_kingman``.
    """
    # Containing-tree node -> gene lineages currently sitting at that node.
    # Leaves start with one new gene node per embedded taxon.
    lineages_at = {}
    for leaf in self.leaf_iter():
        lineages_at[leaf] = [dataobject.Node(taxon=gene_taxon)
                             for gene_taxon in leaf.edge.embedded_taxa]

    # Post-order walk: children are resolved before their parent edge.
    for edge in self.postorder_edge_iter():
        head = edge.head_node
        at_root = head.parent_node is None

        if hasattr(edge, edge_pop_size_attr):
            size = getattr(edge, edge_pop_size_attr)
        else:
            size = default_pop_size

        if not at_root:
            # Coalesce for the duration of this edge, then pass whatever is
            # left up to the parent node.
            leftover = coalescent.coalesce(nodes=lineages_at[head],
                                           pop_size=size,
                                           period=edge.length,
                                           rng=rng)
            parent = edge.tail_node
            if parent in lineages_at:
                lineages_at[parent].extend(leftover)
            else:
                lineages_at[parent] = leftover
            continue

        # Root: no time limit; a lone lineage is used as-is.
        if len(lineages_at[head]) > 1:
            final = coalescent.coalesce(nodes=lineages_at[head],
                                        pop_size=size,
                                        period=None,
                                        rng=rng)
        else:
            final = lineages_at[head]

    # Wrap the surviving root lineage in a rooted tree.
    result = dataobject.Tree(taxon_set=self.embedded_taxon_set, label=label)
    result.seed_node = final[0]
    result.is_rooted = True
    return result
def simulate_contained_kingman(self,
                               edge_pop_size_attr='pop_size',
                               default_pop_size=1,
                               label=None,
                               rng=None):
    """
    Simulates and returns a "censored" (Kingman) neutral coalescence tree
    conditional on self.

        ``edge_pop_size_attr``
            Name of the attribute on self's edges that gives the population
            size. Edges lacking it use ``default_pop_size``.

        ``default_pop_size``
            Population size for edges without ``edge_pop_size_attr``.
            Defaults to 1.

        ``label``
            Label passed to the constructor of the returned tree.

        ``rng``
            Random number generator, handed unchanged (including ``None``)
            to ``coalescent.coalesce()``.

    Each leaf's ``edge.contained_taxa`` supplies the gene taxa placed at that
    leaf, so all edge-associated taxon sets must be up-to-date (otherwise,
    ``build_edge_taxa_sets()`` should be called). The tree is *not* added to
    the set of contained trees; for that, call ``embed_contained_kingman``.
    """
    # Containing-tree node -> gene lineages currently sitting at that node.
    # Leaves start with one new gene node per contained taxon.
    lineages_at = {}
    for leaf in self.leaf_iter():
        lineages_at[leaf] = [dataobject.Node(taxon=gene_taxon)
                             for gene_taxon in leaf.edge.contained_taxa]

    # Post-order walk: children are resolved before their parent edge.
    for edge in self.postorder_edge_iter():
        head = edge.head_node
        at_root = head.parent_node is None

        if hasattr(edge, edge_pop_size_attr):
            size = getattr(edge, edge_pop_size_attr)
        else:
            size = default_pop_size

        if not at_root:
            # Coalesce for the duration of this edge, then pass whatever is
            # left up to the parent node.
            leftover = coalescent.coalesce(nodes=lineages_at[head],
                                           pop_size=size,
                                           period=edge.length,
                                           rng=rng)
            parent = edge.tail_node
            if parent in lineages_at:
                lineages_at[parent].extend(leftover)
            else:
                lineages_at[parent] = leftover
            continue

        # Root: no time limit; a lone lineage is used as-is.
        if len(lineages_at[head]) > 1:
            final = coalescent.coalesce(nodes=lineages_at[head],
                                        pop_size=size,
                                        period=None,
                                        rng=rng)
        else:
            final = lineages_at[head]

    # Wrap the surviving root lineage in a rooted tree.
    result = dataobject.Tree(taxon_set=self.contained_taxon_set, label=label)
    result.seed_node = final[0]
    result.is_rooted = True
    return result
def constrained_kingman(pop_tree,
                        gene_trees_block=None,
                        node_factory=None,
                        tree_factory=None,
                        rng=None,
                        num_genes_attr='num_genes',
                        pop_size_attr='pop_size'):
    """
    Given a population tree, `pop_tree`, this will return a *pair of trees*: a
    gene tree simulated on this population tree based on Kingman's
    n-coalescent, and a population tree with the additional attribute
    `gene_nodes` on each node, which is a list of uncoalesced nodes from the
    gene tree associated with the given node from the population tree.

    `pop_tree` should be a DendroPy Tree object (or an object of a class
    derived from it) whose leaf nodes carry the attribute named by
    `num_genes_attr` (default `num_genes`): the number of gene samples from
    each population in the present. Each edge on the tree should also have
    the attribute named by `pop_size_attr` (default `pop_size`): the
    effective size of the population at this time. A non-root edge without
    it is simulated with a population size of 1.

    If `gene_trees_block` is given, then the gene tree is appended to it, and
    its `taxa_block` is used to manage the gene tree's taxa.

    `node_factory`, if given, is called with no arguments to create each gene
    node; otherwise `trees.Node()` is used. `tree_factory`, if given, supplies
    the gene tree through its `new_tree()` method; otherwise `trees.Tree()`
    is used.

    `rng` is the random number generator; `GLOBAL_RNG` is used when None.

    Returns the tuple `(gene_tree, poptree_copy)`.
    """
    if rng is None:
        rng = GLOBAL_RNG  # use the global rng by default

    if gene_trees_block is not None:
        gtaxa = gene_trees_block.taxa_block
    else:
        gtaxa = taxa.TaxaBlock()

    # We create a set of gene nodes for each leaf node on the population
    # tree, and associate those gene nodes with the leaf through the
    # `gene_nodes` attribute. Gene taxa are labelled "<leaf label>_<index>"
    # with a 1-based index. Note that this decorates the *input* tree.
    for leaf in pop_tree.leaf_iter():
        gene_nodes = []
        for gene_count in range(getattr(leaf, num_genes_attr)):
            if node_factory is not None:
                gene_node = node_factory()
            else:
                gene_node = trees.Node()
            gene_node.taxon = gtaxa.get_taxon(
                label=leaf.taxon.label + '_' + str(gene_count + 1))
            gene_nodes.append(gene_node)
        leaf.gene_nodes = gene_nodes

    # start with a new (deep) copy of the population tree so as not to
    # change the original tree any further
    poptree_copy = copy.deepcopy(pop_tree)

    # start with a new tree
    if tree_factory is not None:
        gene_tree = tree_factory.new_tree()
    else:
        gene_tree = trees.Tree()

    # We iterate through the edges of the population tree in post-order,
    # i.e., visiting child edges before we visit parent edges. For each edge
    # visited, we take the genes found at its head node and run the
    # coalescent simulation on them, bounded by the length of the edge. Any
    # genes that have not yet coalesced at the end of this period are added
    # to the genes of the tail (parent) node of the edge.
    #
    # `pop_size` is initialised up front so that a tree consisting of a
    # single (root) node no longer fails with UnboundLocalError.
    pop_size = 1
    for edge in poptree_copy.postorder_edge_iter():
        head_node = edge.head_node
        if head_node.parent_node is None:
            # Root: run the unconstrained coalescent. The root edge's own
            # population size is honoured when present; previously it was
            # ignored in favour of whatever the last child edge had set.
            # When the root edge lacks the attribute, the most recently used
            # value is kept, as before.
            if hasattr(edge, pop_size_attr):
                pop_size = getattr(edge, pop_size_attr)
            if len(head_node.gene_nodes) > 1:
                final = coalescent.coalesce(nodes=head_node.gene_nodes,
                                            pop_size=pop_size,
                                            period=None,
                                            rng=rng)
            else:
                final = head_node.gene_nodes
            gene_tree.seed_node = final[0]
        else:
            if hasattr(edge, pop_size_attr):
                pop_size = getattr(edge, pop_size_attr)
            else:
                # this means all our time will be in population units
                pop_size = 1
            uncoal = coalescent.coalesce(nodes=head_node.gene_nodes,
                                         pop_size=pop_size,
                                         period=edge.length,
                                         rng=rng)
            tail_node = edge.tail_node
            if not hasattr(tail_node, 'gene_nodes'):
                tail_node.gene_nodes = []
            tail_node.gene_nodes.extend(uncoal)

    if gene_trees_block is not None:
        gene_trees_block.append(gene_tree)
    return gene_tree, poptree_copy