def pure_kingman(taxon_set, pop_size=1, rng=None):
    """
    Generates a tree under the unconstrained Kingman's coalescent process.

    One leaf node is created for every taxon in `taxon_set`; the nodes are
    handed to `coalescent.coalesce()` with no time limit (`period=None`), and
    the first node of the list it returns becomes the seed node of the tree.

        `taxon_set`
            Iterable of taxa. It is also attached to the returned tree.

        `pop_size`
            Population size forwarded to `coalescent.coalesce()`.
            Defaults to 1.

        `rng`
            Random number generator forwarded to `coalescent.coalesce()`.
            If None, `GLOBAL_RNG` is used.

    Returns a `dataobject.Tree`.
    """
    # get our random number generator
    if rng is None:
        rng = GLOBAL_RNG  # use the global rng by default

    nodes = [dataobject.Node(taxon=t) for t in taxon_set]
    # This call used to pass use_expected_tmrca=True, which made the function
    # byte-for-byte identical to `mean_kingman()` (the expected-times variant).
    # NOTE(review): assumes use_expected_tmrca=False makes coalesce() draw the
    # waiting times instead of using their expectation -- confirm.
    seed_node = coalescent.coalesce(
        nodes=nodes,
        pop_size=pop_size,
        period=None,
        rng=rng,
        use_expected_tmrca=False,
    )[0]
    return dataobject.Tree(taxon_set=taxon_set, seed_node=seed_node)
def mean_kingman(taxon_set, pop_size=1, rng=None):
    """
    Returns a tree with coalescent intervals given by the expected times under
    Kingman's neutral coalescent.

        `taxon_set`
            Iterable of taxa; one leaf node is created per taxon, and the
            set is attached to the returned tree.

        `pop_size`
            Population size forwarded to `coalescent.coalesce()`.
            Defaults to 1.

        `rng`
            Random number generator forwarded to `coalescent.coalesce()`.
            If None, `GLOBAL_RNG` is used. This parameter was previously
            missing even though the body read `rng`, so every call failed
            with UnboundLocalError.

    Returns a `dataobject.Tree` whose seed node is the first node returned by
    `coalescent.coalesce()`.
    """
    # get our random number generator
    if rng is None:
        rng = GLOBAL_RNG  # use the global rng by default

    nodes = [dataobject.Node(taxon=t) for t in taxon_set]
    seed_node = coalescent.coalesce(
        nodes=nodes,
        pop_size=pop_size,
        period=None,
        rng=rng,
        use_expected_tmrca=True,
    )[0]
    return dataobject.Tree(taxon_set=taxon_set, seed_node=seed_node)
# Example #3
def pure_kingman(taxon_set, pop_size=1, rng=None):
    """
    Generates a tree under the unconstrained Kingman's coalescent process.

    One leaf node is created for every taxon in `taxon_set`; the nodes are
    handed to `coalescent.coalesce()` with no time limit (`period=None`), and
    the first node of the list it returns becomes the seed node of the tree.

        `taxon_set`
            Iterable of taxa. It is also attached to the returned tree.

        `pop_size`
            Population size forwarded to `coalescent.coalesce()`.
            Defaults to 1.

        `rng`
            Random number generator forwarded to `coalescent.coalesce()`.
            If None, `GLOBAL_RNG` is used.

    Returns a `dataobject.Tree`.
    """
    # get our random number generator
    if rng is None:
        rng = GLOBAL_RNG  # use the global rng by default

    nodes = [dataobject.Node(taxon=t) for t in taxon_set]
    # The call was truncated after `nodes=nodes,`; the remaining arguments are
    # restored from the untruncated copy of this function in the same file.
    # That copy passed use_expected_tmrca=True, which duplicates
    # `mean_kingman()`.
    # NOTE(review): assumes use_expected_tmrca=False makes coalesce() draw the
    # waiting times instead of using their expectation -- confirm.
    seed_node = coalescent.coalesce(
        nodes=nodes,
        pop_size=pop_size,
        period=None,
        rng=rng,
        use_expected_tmrca=False,
    )[0]
    return dataobject.Tree(taxon_set=taxon_set, seed_node=seed_node)
# Example #4
def mean_kingman(taxon_set, pop_size=1, rng=None):
    """
    Returns a tree with coalescent intervals given by the expected times under
    Kingman's neutral coalescent.

        `taxon_set`
            Iterable of taxa; one leaf node is created per taxon, and the
            set is attached to the returned tree.

        `pop_size`
            Population size forwarded to `coalescent.coalesce()`.
            Defaults to 1.

        `rng`
            Random number generator forwarded to `coalescent.coalesce()`.
            If None, `GLOBAL_RNG` is used. This parameter was previously
            missing even though the body read `rng`, so every call failed
            with UnboundLocalError.

    Returns a `dataobject.Tree` whose seed node is the first node returned by
    `coalescent.coalesce()`.
    """
    # get our random number generator
    if rng is None:
        rng = GLOBAL_RNG  # use the global rng by default

    nodes = [dataobject.Node(taxon=t) for t in taxon_set]
    # The call was truncated after `nodes=nodes,`; the remaining arguments are
    # restored from the untruncated copy of this function in the same file.
    seed_node = coalescent.coalesce(
        nodes=nodes,
        pop_size=pop_size,
        period=None,
        rng=rng,
        use_expected_tmrca=True,
    )[0]
    return dataobject.Tree(taxon_set=taxon_set, seed_node=seed_node)
def constrained_kingman(
    # NOTE(review): this copy appears to have been damaged during text
    # extraction and is not valid Python as it stands. The parameter list and
    # the closing `):` are missing; the body reads `pop_tree`,
    # `gene_tree_list`, `rng`, `gene_node_label_func`, `num_genes_attr`,
    # `pop_size_attr` and `decorate_original_tree`, so those are presumably
    # the parameters (order and defaults cannot be told from here -- confirm
    # against the upstream source before repairing). The prose that follows
    # looks like a docstring that has lost its triple-quote delimiters.
    Given a population tree, `pop_tree` this will return a *pair of
    trees*: a gene tree simulated on this population tree based on
    Kingman's n-coalescent, and population tree with the additional
    attribute 'gene_nodes' on each node, which is a list of
    uncoalesced nodes from the gene tree associated with the given
    node from the population tree.

    `pop_tree` should be a DendroPy Tree object or an object
    of a class derived from this with the following attribute
    `num_genes` -- the number of gene samples from each population in the
    present.  Each edge on the tree should also have the attribute

    `pop_size_attr` is the attribute name of the edges of `pop_tree` that
    specify the population size. By default it is `pop_size`. The should
    specify the effective *haploid* population size; i.e., number of gene
    in the population: 2 * N in a diploid population of N individuals,
    or N in a haploid population of N individuals.

    If `pop_size` is 1 or 0 or None, then the edge lengths of `pop_tree` is
    taken to be in haploid population units; i.e. where 1 unit equals 2N
    generations for a diploid population of size N, or N generations for a
    haploid population of size N. Otherwise the edge lengths of `pop_tree` is
    taken to be in generations.

    If `gene_tree_list` is given, then the gene tree is added to the
    tree block, and the tree block's taxa block will be used to manage
    the gene tree's `taxa`.

    `gene_node_label_func` is a function that takes two arguments (a string
    and an integer, respectively, where the string is the containing species
    taxon label and the integer is the gene index) and returns a label for
    the corresponding the gene node.

    if `decorate_original_tree` is True, then the list of uncoalesced nodes at
    each node of the population tree is added to the original (input) population
    tree instead of a copy.

    Note that this function does very much the same thing as `contained_coalescent()`,
    but provides a very different API.

    # get our random number generator
    if rng is None:
        rng = GLOBAL_RNG  # use the global rng by default

    if gene_tree_list is not None:
        gtaxa = gene_tree_list.taxon_set
        # NOTE(review): an `else:` presumably preceded the next line; without
        # it the taxon set taken from `gene_tree_list` is always overwritten.
        gtaxa = dataobject.TaxonSet()

    if gene_node_label_func is None:
        gene_node_label_func = lambda x, y: "%s_%02d" % (x, y)

    # we create a set of gene nodes for each leaf node on the population
    # tree, and associate those gene nodes to the leaf by assignment
    # of 'taxon'.
    for leaf_count, leaf in enumerate(pop_tree.leaf_iter()):
        gene_nodes = []
        for gene_count in range(getattr(leaf, num_genes_attr)):
            gene_node = dataobject.Node()
            gene_node.taxon = gtaxa.require_taxon(label=gene_node_label_func(leaf.taxon.label, gene_count + 1))
            # NOTE(review): `gene_node` is never added to `gene_nodes` in the
            # visible code, so each leaf is given an empty list -- an append
            # appears to be missing here.
        leaf.gene_nodes = gene_nodes

    # We iterate through the edges of the population tree in post-order,
    # i.e., visiting child edges before we visit parent edges. For
    # each edge visited, we take the genes found in the child nodes,
    # and run the coalescent simulation on them attacheded by the length
    # of the edge. Any genes that have not yet coalesced at the end of
    # this period are added to the genes of the tail (parent) node of
    # the edge.

    if decorate_original_tree:
        working_poptree = pop_tree
        # NOTE(review): the comment and copy below read like an `else:` branch
        # whose header is missing; as written the copy replaces `pop_tree`
        # whenever the flag is set and `working_poptree` is unbound otherwise.
        # start with a new (deep) copy of the population tree so as to not
        # to change the original tree
        working_poptree = dataobject.Tree(pop_tree)

    # start with a new tree
    gene_tree = dataobject.Tree()
    gene_tree.taxon_set = gtaxa
    for edge in working_poptree.postorder_edge_iter():

        # if mrca root, run unconstrained coalescent
        if edge.head_node.parent_node is None:
            if len(edge.head_node.gene_nodes) > 1:
                # NOTE(review): `pop_size` is only assigned further down in
                # this loop body, so it is whatever an earlier iteration left
                # behind (or unbound on the first one).
                final = coalescent.coalesce(nodes=edge.head_node.gene_nodes, pop_size=pop_size, period=None, rng=rng)
                # NOTE(review): an `else:` header looks to be missing; as
                # written the next line discards the result assigned above.
                final = edge.head_node.gene_nodes
            gene_tree.seed_node = final[0]

            # NOTE(review): the non-root handling below presumably sat under
            # an `else:` paired with the root check; that header is missing.
            if hasattr(edge, pop_size_attr):
                pop_size = getattr(edge, pop_size_attr)
                # NOTE(review): missing `else:` -- the fallback below currently
                # overwrites the attribute value read on the previous line.
                # this means all our time will be in population units
                pop_size = 1

            uncoal = coalescent.coalesce(
                nodes=edge.head_node.gene_nodes, pop_size=pop_size, period=edge.length, rng=rng
            # NOTE(review): the closing parenthesis of the call above is
            # missing, and nothing visible moves `uncoal` onto
            # `edge.tail_node.gene_nodes`.
            if not hasattr(edge.tail_node, "gene_nodes"):
                edge.tail_node.gene_nodes = []

    gene_tree.is_rooted = True
    # NOTE(review): both returns below yield the same pair and the second is
    # unreachable as written; the step that adds `gene_tree` to
    # `gene_tree_list` (described in the prose above) is not visible.
    if gene_tree_list is not None:
        return gene_tree, working_poptree
        return gene_tree, working_poptree
def contained_coalescent(
    containing_tree, gene_to_containing_taxon_map, edge_pop_size_attr="pop_size", default_pop_size=1, rng=None
):
    """
    Returns a gene tree simulated under the coalescent contained within a
    population or species tree.

        `containing_tree`
            The population or species tree. If `edge_pop_size_attr` is not
            None, and population sizes given are non-trivial (i.e., >1), then
            edge lengths on this tree are in units of generations. Otherwise
            edge lengths are in population units; i.e. 2N generations for
            diploid populations of size N, or N generations for haploid
            populations of size N.

        `gene_to_containing_taxon_map`
            A TaxonSetMapping object mapping Taxon objects in the
            `containing_tree` TaxonSet to corresponding Taxon objects in the
            resulting gene tree.

        `edge_pop_size_attr`
            Name of attribute of edges that specify population size. By default
            this is "pop_size". If this attribute does not exist,
            `default_pop_size` will be used.  The value for this attribute
            should be the haploid population size or the number of genes;
            i.e.  2N for a diploid population of N individuals, or N for a
            haploid population of N individuals. This value determines how
            branch length units are interpreted in the input tree,
            `containing_tree`.  If a biologically-meaningful value, then branch
            lengths on the `containing_tree` are properly read as generations.
            If not (e.g. 1 or 0), then they are in population units, i.e. where
            1 unit of time equals 2N generations for a diploid population of
            size N, or N generations for a haploid population of size N.
            Otherwise time is in generations. If this argument is None, then
            population sizes default to `default_pop_size`.

        `default_pop_size`
            Population size to use if `edge_pop_size_attr` is None or
            if an edge does not have the attribute. Defaults to 1.

        `rng`
            Random number generator forwarded to `coalescent.coalesce()`.
            If None, `GLOBAL_RNG` is used.

    The returned gene tree will have the following extra attributes:

        `pop_node_genes`
            A dictionary with nodes of `containing_tree` as keys and a list of gene
            tree nodes that are uncoalesced as values.

    Note that this function does very much the same thing as
    `constrained_kingman()`, but provides a very different API.
    """
    if rng is None:
        rng = GLOBAL_RNG

    gene_tree_taxon_set = gene_to_containing_taxon_map.domain_taxon_set
    # Indexed below by containing-tree taxon to obtain that taxon's gene taxa.
    pop_gene_taxa = gene_to_containing_taxon_map.reverse
    if gene_tree_taxon_set is None:
        # The original looped over an undefined name (`pop_gene_taxa_map`) and
        # used `dendropy.TaxonSet`; both are replaced with names this function
        # already relies on.
        # NOTE(review): assumes iterating `reverse` yields its keys and that
        # the taxon set exposes `add()` -- the original loop body was lost.
        gene_tree_taxon_set = dataobject.TaxonSet()
        for pop_taxon in pop_gene_taxa:
            for taxon in pop_gene_taxa[pop_taxon]:
                gene_tree_taxon_set.add(taxon)
    gene_tree = dataobject.Tree(taxon_set=gene_tree_taxon_set)
    gene_tree.is_rooted = True

    # Seed every mapped node of the containing tree with one gene node per
    # gene taxon assigned to it.
    pop_node_genes = {}
    for nd in containing_tree.postorder_node_iter():
        if nd.taxon and nd.taxon in pop_gene_taxa:
            pop_node_genes[nd] = []
            for gene_taxon in pop_gene_taxa[nd.taxon]:
                gene_node = dataobject.Node()
                gene_node.taxon = gene_taxon
                pop_node_genes[nd].append(gene_node)

    # Children are visited before parents, so whatever has not coalesced on an
    # edge is passed up to the edge's tail (parent) node.
    for edge in containing_tree.postorder_edge_iter():

        if edge_pop_size_attr and hasattr(edge, edge_pop_size_attr):
            pop_size = getattr(edge, edge_pop_size_attr)
        else:
            pop_size = default_pop_size

        if edge.head_node.parent_node is None:
            # Root: no time limit, coalesce whatever is left.
            # NOTE(review): the root uses `default_pop_size`, not the
            # per-edge `pop_size` computed above; kept as found -- confirm
            # this is intended.
            if len(pop_node_genes[edge.head_node]) > 1:
                final = coalescent.coalesce(
                    nodes=pop_node_genes[edge.head_node], pop_size=default_pop_size, period=None, rng=rng
                )
            else:
                final = pop_node_genes[edge.head_node]
            gene_tree.seed_node = final[0]
        else:
            uncoal = coalescent.coalesce(
                nodes=pop_node_genes[edge.head_node], pop_size=pop_size, period=edge.length, rng=rng
            )
            if edge.tail_node not in pop_node_genes:
                pop_node_genes[edge.tail_node] = []
            pop_node_genes[edge.tail_node].extend(uncoal)

    gene_tree.pop_node_genes = pop_node_genes
    return gene_tree
# Example #7
def constrained_kingman(pop_tree,
    # NOTE(review): this copy appears to have been damaged during text
    # extraction and is not valid Python as it stands. The parameter list is
    # cut off after `pop_tree,`; the body also reads `gene_tree_list`, `rng`,
    # `gene_node_label_func`, `num_genes_attr`, `pop_size_attr` and
    # `decorate_original_tree`, so those are presumably the remaining
    # parameters (order and defaults cannot be told from here -- confirm
    # against the upstream source before repairing). The prose that follows
    # looks like a docstring that has lost its triple-quote delimiters.
    Given a population tree, `pop_tree` this will return a *pair of
    trees*: a gene tree simulated on this population tree based on
    Kingman's n-coalescent, and population tree with the additional
    attribute 'gene_nodes' on each node, which is a list of
    uncoalesced nodes from the gene tree associated with the given
    node from the population tree.

    `pop_tree` should be a DendroPy Tree object or an object
    of a class derived from this with the following attribute
    `num_genes` -- the number of gene samples from each population in the
    present.  Each edge on the tree should also have the attribute

    `pop_size_attr` is the attribute name of the edges of `pop_tree` that
    specify the population size. By default it is `pop_size`. The should
    specify the effective *haploid* population size; i.e., number of gene
    in the population: 2 * N in a diploid population of N individuals,
    or N in a haploid population of N individuals.

    If `pop_size` is 1 or 0 or None, then the edge lengths of `pop_tree` is
    taken to be in haploid population units; i.e. where 1 unit equals 2N
    generations for a diploid population of size N, or N generations for a
    haploid population of size N. Otherwise the edge lengths of `pop_tree` is
    taken to be in generations.

    If `gene_tree_list` is given, then the gene tree is added to the
    tree block, and the tree block's taxa block will be used to manage
    the gene tree's `taxa`.

    `gene_node_label_func` is a function that takes two arguments (a string
    and an integer, respectively, where the string is the containing species
    taxon label and the integer is the gene index) and returns a label for
    the corresponding the gene node.

    if `decorate_original_tree` is True, then the list of uncoalesced nodes at
    each node of the population tree is added to the original (input) population
    tree instead of a copy.

    Note that this function does very much the same thing as `contained_coalescent()`,
    but provides a very different API.

    # get our random number generator
    if rng is None:
        rng = GLOBAL_RNG  # use the global rng by default

    if gene_tree_list is not None:
        gtaxa = gene_tree_list.taxon_set
        # NOTE(review): an `else:` presumably preceded the next line; without
        # it the taxon set taken from `gene_tree_list` is always overwritten.
        gtaxa = dataobject.TaxonSet()

    if gene_node_label_func is None:
        gene_node_label_func = lambda x, y: "%s_%02d" % (x, y)

    # we create a set of gene nodes for each leaf node on the population
    # tree, and associate those gene nodes to the leaf by assignment
    # of 'taxon'.
    for leaf_count, leaf in enumerate(pop_tree.leaf_iter()):
        gene_nodes = []
        for gene_count in range(getattr(leaf, num_genes_attr)):
            gene_node = dataobject.Node()
            gene_node.taxon = gtaxa.require_taxon(
                label=gene_node_label_func(leaf.taxon.label, gene_count + 1))
            # NOTE(review): `gene_node` is never added to `gene_nodes` in the
            # visible code, so each leaf is given an empty list -- an append
            # appears to be missing here.
        leaf.gene_nodes = gene_nodes

    # We iterate through the edges of the population tree in post-order,
    # i.e., visiting child edges before we visit parent edges. For
    # each edge visited, we take the genes found in the child nodes,
    # and run the coalescent simulation on them attacheded by the length
    # of the edge. Any genes that have not yet coalesced at the end of
    # this period are added to the genes of the tail (parent) node of
    # the edge.

    if decorate_original_tree:
        working_poptree = pop_tree
        # NOTE(review): the comment and copy below read like an `else:` branch
        # whose header is missing; as written the copy replaces `pop_tree`
        # whenever the flag is set and `working_poptree` is unbound otherwise.
        # start with a new (deep) copy of the population tree so as to not
        # to change the original tree
        working_poptree = copy.deepcopy(pop_tree)

    # start with a new tree
    gene_tree = dataobject.Tree()
    gene_tree.taxon_set = gtaxa
    for edge in working_poptree.postorder_edge_iter():

        # if mrca root, run unconstrained coalescent
        if edge.head_node.parent_node is None:
            if len(edge.head_node.gene_nodes) > 1:
                # NOTE(review): the call below is cut off after its first
                # argument, and an `else:` header looks to be missing before
                # the second assignment to `final`.
                final = coalescent.coalesce(nodes=edge.head_node.gene_nodes,
                final = edge.head_node.gene_nodes
            gene_tree.seed_node = final[0]

            # NOTE(review): the non-root handling below presumably sat under
            # an `else:` paired with the root check; that header is missing.
            if hasattr(edge, pop_size_attr):
                pop_size = getattr(edge, pop_size_attr)
                # NOTE(review): missing `else:` -- the fallback below currently
                # overwrites the attribute value read on the previous line.
                # this means all our time will be in population units
                pop_size = 1

            # NOTE(review): the call below is cut off after its first
            # argument, and nothing visible moves `uncoal` onto
            # `edge.tail_node.gene_nodes`.
            uncoal = coalescent.coalesce(nodes=edge.head_node.gene_nodes,
            if not hasattr(edge.tail_node, 'gene_nodes'):
                edge.tail_node.gene_nodes = []

    gene_tree.is_rooted = True
    # NOTE(review): both returns below yield the same pair and the second is
    # unreachable as written; the step that adds `gene_tree` to
    # `gene_tree_list` (described in the prose above) is not visible.
    if gene_tree_list is not None:
        return gene_tree, working_poptree
        return gene_tree, working_poptree
# Example #8
def contained_coalescent(containing_tree,
        gene_to_containing_taxon_map,
        edge_pop_size_attr="pop_size",
        default_pop_size=1,
        rng=None):
    """
    Returns a gene tree simulated under the coalescent contained within a
    population or species tree.

    NOTE(review): the parameter list was truncated after `containing_tree,`;
    it is restored from the untruncated copy of this function in the same
    file -- confirm.

        `containing_tree`
            The population or species tree. If `edge_pop_size_attr` is not
            None, and population sizes given are non-trivial (i.e., >1), then
            edge lengths on this tree are in units of generations. Otherwise
            edge lengths are in population units; i.e. 2N generations for
            diploid populations of size N, or N generations for haploid
            populations of size N.

        `gene_to_containing_taxon_map`
            A TaxonSetMapping object mapping Taxon objects in the
            `containing_tree` TaxonSet to corresponding Taxon objects in the
            resulting gene tree.

        `edge_pop_size_attr`
            Name of attribute of edges that specify population size. By default
            this is "pop_size". If this attribute does not exist,
            `default_pop_size` will be used.  The value for this attribute
            should be the haploid population size or the number of genes;
            i.e.  2N for a diploid population of N individuals, or N for a
            haploid population of N individuals. This value determines how
            branch length units are interpreted in the input tree,
            `containing_tree`.  If a biologically-meaningful value, then branch
            lengths on the `containing_tree` are properly read as generations.
            If not (e.g. 1 or 0), then they are in population units, i.e. where
            1 unit of time equals 2N generations for a diploid population of
            size N, or N generations for a haploid population of size N.
            Otherwise time is in generations. If this argument is None, then
            population sizes default to `default_pop_size`.

        `default_pop_size`
            Population size to use if `edge_pop_size_attr` is None or
            if an edge does not have the attribute. Defaults to 1.

        `rng`
            Random number generator forwarded to `coalescent.coalesce()`.
            If None, `GLOBAL_RNG` is used.

    The returned gene tree will have the following extra attributes:

        `pop_node_genes`
            A dictionary with nodes of `containing_tree` as keys and a list of gene
            tree nodes that are uncoalesced as values.

    Note that this function does very much the same thing as
    `constrained_kingman()`, but provides a very different API.
    """
    if rng is None:
        rng = GLOBAL_RNG

    gene_tree_taxon_set = gene_to_containing_taxon_map.domain_taxon_set
    # Indexed below by containing-tree taxon to obtain that taxon's gene taxa.
    pop_gene_taxa = gene_to_containing_taxon_map.reverse
    if gene_tree_taxon_set is None:
        # The original looped over an undefined name (`pop_gene_taxa_map`) and
        # used `dendropy.TaxonSet`; both are replaced with names this function
        # already relies on.
        # NOTE(review): assumes iterating `reverse` yields its keys and that
        # the taxon set exposes `add()` -- the original loop body was lost.
        gene_tree_taxon_set = dataobject.TaxonSet()
        for pop_taxon in pop_gene_taxa:
            for taxon in pop_gene_taxa[pop_taxon]:
                gene_tree_taxon_set.add(taxon)
    gene_tree = dataobject.Tree(taxon_set=gene_tree_taxon_set)
    gene_tree.is_rooted = True

    # Seed every mapped node of the containing tree with one gene node per
    # gene taxon assigned to it.
    pop_node_genes = {}
    for nd in containing_tree.postorder_node_iter():
        if nd.taxon and nd.taxon in pop_gene_taxa:
            pop_node_genes[nd] = []
            for gene_taxon in pop_gene_taxa[nd.taxon]:
                gene_node = dataobject.Node()
                gene_node.taxon = gene_taxon
                pop_node_genes[nd].append(gene_node)

    # Children are visited before parents, so whatever has not coalesced on an
    # edge is passed up to the edge's tail (parent) node.
    for edge in containing_tree.postorder_edge_iter():

        if edge_pop_size_attr and hasattr(edge, edge_pop_size_attr):
            pop_size = getattr(edge, edge_pop_size_attr)
        else:
            pop_size = default_pop_size

        if edge.head_node.parent_node is None:
            # Root: no time limit, coalesce whatever is left.
            # NOTE(review): both coalesce() calls in this loop were truncated;
            # their arguments are restored from the untruncated copy of this
            # function, where the root uses `default_pop_size` rather than
            # the per-edge `pop_size` -- confirm this is intended.
            if len(pop_node_genes[edge.head_node]) > 1:
                final = coalescent.coalesce(
                    nodes=pop_node_genes[edge.head_node],
                    pop_size=default_pop_size,
                    period=None,
                    rng=rng)
            else:
                final = pop_node_genes[edge.head_node]
            gene_tree.seed_node = final[0]
        else:
            uncoal = coalescent.coalesce(nodes=pop_node_genes[edge.head_node],
                                         pop_size=pop_size,
                                         period=edge.length,
                                         rng=rng)
            if edge.tail_node not in pop_node_genes:
                pop_node_genes[edge.tail_node] = []
            pop_node_genes[edge.tail_node].extend(uncoal)

    gene_tree.pop_node_genes = pop_node_genes
    return gene_tree
# Example #9
    def simulate_contained_kingman(self,
        # NOTE(review): this method appears to have been damaged during text
        # extraction and is not valid Python as it stands. The parameter list
        # is cut off after `self,`; the body reads `edge_pop_size_attr`,
        # `default_pop_size` and `label`, and the prose mentions a random
        # number generator, so those are presumably the remaining parameters
        # (order and defaults cannot be told from here -- confirm). The prose
        # that follows looks like a docstring that has lost its triple quotes
        # and its parameter labels, and its last sentence is cut short.
        Simulates and returns a "censored" (Kingman) neutral coalescence tree
        conditional on self.

                Random number generator to use. If ``None``, the default will
                be used.

                Name of attribute of self's edges that specify the population
                size. If this attribute does not exist, then the population
                size is taken to be 1.

        Note that all edge-associated taxon sets must be up-to-date (otherwise,
        ``build_edge_taxa_sets()`` should be called), and that the tree
        is *not* added to the set of embedded trees. For the latter, call

        # Dictionary that maps nodes of containing tree to list of
        # corresponding nodes on gene tree, initially populated with leaf
        # nodes.
        embedded_nodes = {}
        for nd in self.leaf_iter():
            embedded_nodes[nd] = []
            for gt in nd.edge.embedded_taxa:
                gn = dataobject.Node(taxon=gt)
                # NOTE(review): `gn` is never stored in `embedded_nodes[nd]`
                # in the visible code -- an append appears to be missing.

        # Generate the tree structure
        for edge in self.postorder_edge_iter():
            if edge.head_node.parent_node is None:
                # root: run unconstrained coalescence until just one gene node
                # remaining
                if hasattr(edge, edge_pop_size_attr):
                    pop_size = getattr(edge, edge_pop_size_attr)
                    # NOTE(review): missing `else:` -- the default below
                    # currently overwrites the attribute value read above.
                    pop_size = default_pop_size
                if len(embedded_nodes[edge.head_node]) > 1:
                    # NOTE(review): the call below is cut off after its first
                    # argument, and an `else:` header looks to be missing
                    # before the second assignment to `final`.
                    final = coalescent.coalesce(nodes=embedded_nodes[edge.head_node],
                    final = embedded_nodes[edge.head_node]
                # NOTE(review): the non-root handling below presumably sat
                # under an `else:` paired with the root check (missing).
                # run until next coalescence event, as determined by this edge
                # size.
                if hasattr(edge, edge_pop_size_attr):
                    pop_size = getattr(edge, edge_pop_size_attr)
                    # NOTE(review): missing `else:` here as well.
                    pop_size = default_pop_size
                # NOTE(review): the call below is truncated, and the
                # `except KeyError:` after it has no visible `try:`; the try
                # body presumably added `remaining` to an existing
                # `embedded_nodes[edge.tail_node]` list -- confirm.
                remaining = coalescent.coalesce(nodes=embedded_nodes[edge.head_node],
                except KeyError:
                    embedded_nodes[edge.tail_node] = remaining

        # Create and return the full tree
        embedded_tree = dataobject.Tree(taxon_set=self.embedded_taxon_set, label=label)
        # `final` is only assigned in the root branch of the loop above.
        embedded_tree.seed_node = final[0]
        embedded_tree.is_rooted = True
        return embedded_tree
# Example #10
    def simulate_contained_kingman(self,
        # NOTE(review): this method appears to have been damaged during text
        # extraction and is not valid Python as it stands. The parameter list
        # is cut off after `self,`; the body reads `edge_pop_size_attr` and
        # `default_pop_size`, and the prose mentions a random number
        # generator, so those are presumably among the remaining parameters
        # (order and defaults cannot be told from here -- confirm). The prose
        # that follows looks like a docstring that has lost its triple quotes
        # and its parameter labels, and its last sentence is cut short.
        Simulates and returns a "censored" (Kingman) neutral coalescence tree
        conditional on self.

                Random number generator to use. If ``None``, the default will
                be used.

                Name of attribute of self's edges that specify the population
                size. If this attribute does not exist, then the population
                size is taken to be 1.

        Note that all edge-associated taxon sets must be up-to-date (otherwise,
        ``build_edge_taxa_sets()`` should be called), and that the tree
        is *not* added to the set of contained trees. For the latter, call

        # Dictionary that maps nodes of containing tree to list of
        # corresponding nodes on gene tree, initially populated with leaf
        # nodes.
        contained_nodes = {}
        for nd in self.leaf_iter():
            contained_nodes[nd] = []
            for gt in nd.edge.contained_taxa:
                gn = dataobject.Node(taxon=gt)
                # NOTE(review): `gn` is never stored in `contained_nodes[nd]`
                # in the visible code -- an append appears to be missing.

        # Generate the tree structure
        for edge in self.postorder_edge_iter():
            if edge.head_node.parent_node is None:
                # root: run unconstrained coalescence until just one gene node
                # remaining
                if hasattr(edge, edge_pop_size_attr):
                    pop_size = getattr(edge, edge_pop_size_attr)
                    # NOTE(review): missing `else:` -- the default below
                    # currently overwrites the attribute value read above.
                    pop_size = default_pop_size
                if len(contained_nodes[edge.head_node]) > 1:
                    # NOTE(review): the call below has lost all of its
                    # arguments, and an `else:` header looks to be missing
                    # before the second assignment to `final`.
                    final = coalescent.coalesce(
                    final = contained_nodes[edge.head_node]
                # NOTE(review): the non-root handling below presumably sat
                # under an `else:` paired with the root check (missing).
                # run until next coalescence event, as determined by this edge
                # size.
                if hasattr(edge, edge_pop_size_attr):
                    pop_size = getattr(edge, edge_pop_size_attr)
                    # NOTE(review): missing `else:` here as well.
                    pop_size = default_pop_size
                # NOTE(review): the call below has lost its arguments, and
                # the `except KeyError:` after it has no visible `try:`; the
                # try body presumably added `remaining` to an existing
                # `contained_nodes[edge.tail_node]` list -- confirm.
                remaining = coalescent.coalesce(
                except KeyError:
                    contained_nodes[edge.tail_node] = remaining

        # Create and return the full tree
        # NOTE(review): the constructor call below is cut off after its first
        # argument.
        contained_tree = dataobject.Tree(taxon_set=self.contained_taxon_set,
        # `final` is only assigned in the root branch of the loop above.
        contained_tree.seed_node = final[0]
        contained_tree.is_rooted = True
        return contained_tree
# Example #11
def constrained_kingman(pop_tree,
    # NOTE(review): this copy appears to have been damaged during text
    # extraction and is not valid Python as it stands. The parameter list is
    # cut off after `pop_tree,`; the body also reads `gene_trees_block`,
    # `rng`, `num_genes_attr`, `pop_size_attr`, `node_factory` and
    # `tree_factory`, so those are presumably the remaining parameters (order
    # and defaults cannot be told from here -- confirm against the upstream
    # source before repairing). The prose that follows looks like a docstring
    # that has lost its triple-quote delimiters.
    Given a population tree, `pop_tree` this will return a *pair of
    trees*: a gene tree simulated on this population tree based on
    Kingman's n-coalescent, and population tree with the additional
    attribute 'gene_nodes' on each node, which is a list of
    uncoalesced nodes from the gene tree associated with the given
    node from the population tree.

    `pop_tree` should be a DendroPy Tree object or an object
    of a class derived from this with the following attribute
    `num_genes` -- the number of gene samples from each population in the
    present.  Each edge on the tree should also have the attribute
    `pop_size` -- the effective size of the population at this time.
    If `gene_trees_block` is given, then the gene tree is added to the 
    tree block, and the tree block's taxa block will be used to manage
    the gene tree's `taxa`.

    # get our random number generator
    if rng is None:
        rng = GLOBAL_RNG  # use the global rng by default

    if gene_trees_block is not None:
        gtaxa = gene_trees_block.taxa_block
        # NOTE(review): an `else:` presumably preceded the next line; without
        # it the taxa block taken from `gene_trees_block` is always
        # overwritten.
        gtaxa = taxa.TaxaBlock()

    # we create a set of gene nodes for each leaf node on the population
    # tree, and associate those gene nodes to the leaf by assignment
    # of 'taxon'.
    for leaf_count, leaf in enumerate(pop_tree.leaf_iter()):
        gene_nodes = []
        for gene_count in range(getattr(leaf, num_genes_attr)):
            if node_factory is not None:
                gene_node = node_factory()
                # NOTE(review): missing `else:` -- the default node below
                # currently replaces the one built by `node_factory`.
                gene_node = trees.Node()
            gene_node.taxon = gtaxa.get_taxon(label=leaf.taxon.label + '_' +
                                              str(gene_count + 1))
            # NOTE(review): `gene_node` is never added to `gene_nodes` in the
            # visible code, so each leaf is given an empty list -- an append
            # appears to be missing here.
        leaf.gene_nodes = gene_nodes

    # We iterate through the edges of the population tree in post-order,
    # i.e., visiting child edges before we visit parent edges. For
    # each edge visited, we take the genes found in the child nodes,
    # and run the coalescent simulation on them bounded by the length
    # of the edge. Any genes that have not yet coalesced at the end of
    # this period are added to the genes of the tail (parent) node of
    # the edge.

    # start with a new (deep) copy of the population tree so as to not
    # to change the original tree
    poptree_copy = copy.deepcopy(pop_tree)

    # start with a new tree
    if tree_factory is not None:
        gene_tree = tree_factory.new_tree()
        # NOTE(review): missing `else:` -- the default tree below currently
        # replaces the one built by `tree_factory`.
        gene_tree = trees.Tree()
    for edge in poptree_copy.postorder_edge_iter():
        # NOTE(review): the next line assigns the attribute to itself and has
        # no visible effect beyond requiring that `gene_nodes` exists.
        edge.head_node.gene_nodes = edge.head_node.gene_nodes

        # if mrca root, run unconstrained coalescent
        if edge.head_node.parent_node is None:
            if len(edge.head_node.gene_nodes) > 1:
                # NOTE(review): the call below is cut off after its first
                # argument, and an `else:` header looks to be missing before
                # the second assignment to `final`.
                final = coalescent.coalesce(nodes=edge.head_node.gene_nodes,
                final = edge.head_node.gene_nodes
            gene_tree.seed_node = final[0]

            # NOTE(review): the non-root handling below presumably sat under
            # an `else:` paired with the root check; that header is missing.
            if hasattr(edge, pop_size_attr):
                pop_size = getattr(edge, pop_size_attr)
                # NOTE(review): missing `else:` -- the fallback below currently
                # overwrites the attribute value read on the previous line.
                # this means all our time will be in population units
                pop_size = 1

            # NOTE(review): the call below is cut off after its first
            # argument, and nothing visible moves `uncoal` onto
            # `edge.tail_node.gene_nodes`.
            uncoal = coalescent.coalesce(nodes=edge.head_node.gene_nodes,
            if not hasattr(edge.tail_node, 'gene_nodes'):
                edge.tail_node.gene_nodes = []

    # NOTE(review): the body of this `if` is missing; per the prose above it
    # presumably added `gene_tree` to `gene_trees_block` -- confirm.
    if gene_trees_block is not None:

    return gene_tree, poptree_copy