Ejemplo n.º 1
0
def monophyletic_partition_discordance(tree, taxon_set_partition):
    """
    Returns the number of deep coalescences on tree `tree` that would result
    if the taxa in each subset of `taxon_set_partition` formed K
    mutually-exclusive monophyletic groups, where K is the number of subsets.

        `tree`
            Tree whose leaf nodes carry the taxa being partitioned.

        `taxon_set_partition`
            A TaxonSetPartition; its `subsets()` method supplies the groups.

    The value is computed by building a collapsed copy of `tree` in which
    every leaf is relabelled with a stand-in taxon for its subset and
    sibling leaves of the same subset are merged; the result is the number
    of leaves left on that collapsed tree minus K.

    Raises ValueError if a leaf's taxon does not belong to any subset.
    """

    tax_sets = taxon_set_partition.subsets()
    dc_tree = dataobject.Tree()
    dc_tree.taxon_set = dataobject.TaxonSet()

    # One stand-in taxon per subset, labelled with the subset's index.
    for t in range(len(tax_sets)):
        dc_tree.taxon_set.append(dataobject.Taxon(label=str(t)))

    def _get_dc_taxon(nd):
        # Map a leaf of `tree` to the stand-in taxon of the subset holding it.
        for idx, tax_set in enumerate(tax_sets):
            if nd.taxon in tax_set:
                return dc_tree.taxon_set[idx]
        # The original `assert "<message>"` could never fail (a non-empty
        # string is always true), so unknown taxa silently became None.
        raise ValueError("taxon not found in partition: '%s'"
                         % getattr(nd.taxon, "label", None))

    # Maps each node of `tree` to its counterpart on the collapsed tree.
    src_dc_map = {}
    for snd in tree.postorder_node_iter():
        nnd = dataobject.Node()
        src_dc_map[snd] = nnd
        children = snd.child_nodes()
        if len(children) == 0:
            nnd.taxon = _get_dc_taxon(snd)
        else:
            # Distinct stand-in taxa contributed by children that are
            # (collapsed) leaves; order of first appearance is kept.
            taxa_set = []
            for cnd in children:
                dc_node = src_dc_map[cnd]
                if len(dc_node.child_nodes()) > 1:
                    # Child is a genuine internal node: keep its subtree.
                    nnd.add_child(dc_node)
                else:
                    ctax = dc_node.taxon
                    if ctax is not None and ctax not in taxa_set:
                        taxa_set.append(ctax)
                    # The child's counterpart is discarded; only its taxon
                    # (if any) survives via `taxa_set`.
                    del src_dc_map[cnd]
            if len(taxa_set) > 1:
                # Several groups meet here: one fresh leaf per group.
                for t in taxa_set:
                    cnd = dataobject.Node()
                    cnd.taxon = t
                    nnd.add_child(cnd)
            else:
                if len(nnd.child_nodes()) == 0:
                    # All children were leaves of a single group: this node
                    # collapses into a leaf of that group.
                    nnd.taxon = taxa_set[0]
                elif len(taxa_set) == 1:
                    cnd = dataobject.Node()
                    cnd.taxon = taxa_set[0]
                    nnd.add_child(cnd)
    # Post-order visits the root last, so `nnd` is the root's counterpart.
    dc_tree.seed_node = nnd
    return len(dc_tree.leaf_nodes()) - len(tax_sets)
Ejemplo n.º 2
0
def pure_kingman(taxon_set, pop_size=1, rng=None):
    """
    Generates a tree under the unconstrained Kingman's coalescent process.

        `taxon_set`
            Iterable of taxa; one leaf node is created per taxon, and the
            same object is passed on as the returned tree's `taxon_set`.

        `pop_size`
            Population size handed to `coalescent.coalesce`. Defaults to 1.

        `rng`
            Random number generator; `GLOBAL_RNG` is used when None.

    Returns a `dataobject.Tree` whose seed node is the first node returned
    by the coalescent run.
    """

    # Fall back on the module-wide generator when the caller supplies none.
    rng = GLOBAL_RNG if rng is None else rng

    leaves = []
    for taxon in taxon_set:
        leaves.append(dataobject.Node(taxon=taxon))

    # period=None: no time limit is placed on the coalescent run.
    coalesced = coalescent.coalesce(nodes=leaves,
                                    pop_size=pop_size,
                                    period=None,
                                    rng=rng)
    return dataobject.Tree(taxon_set=taxon_set, seed_node=coalesced[0])
Ejemplo n.º 3
0
def mean_kingman(taxon_set, pop_size=1, rng=None):
    """
    Returns a tree with coalescent intervals given by the expected times under
    Kingman's neutral coalescent.

        `taxon_set`
            Iterable of taxa; one leaf node is created per taxon, and the
            same object is passed on as the returned tree's `taxon_set`.

        `pop_size`
            Population size handed to `coalescent.coalesce`. Defaults to 1.

        `rng`
            Random number generator handed to `coalescent.coalesce`;
            `GLOBAL_RNG` is used when None.

    Returns a `dataobject.Tree` whose seed node is the first node returned
    by the coalescent run.
    """

    # `rng` used to be read here without being a parameter, which made every
    # call fail with UnboundLocalError; it is now an optional keyword.
    if rng is None:
        rng = GLOBAL_RNG  # use the global rng by default

    nodes = [dataobject.Node(taxon=t) for t in taxon_set]
    seed_node = coalescent.coalesce(nodes=nodes,
                                    pop_size=pop_size,
                                    period=None,
                                    rng=rng,
                                    use_expected_tmrca=True)[0]
    tree = dataobject.Tree(taxon_set=taxon_set, seed_node=seed_node)
    return tree
Ejemplo n.º 4
0
def constrained_kingman(pop_tree,
                        gene_tree_list=None,
                        rng=None,
                        gene_node_label_func=None,
                        num_genes_attr='num_genes',
                        pop_size_attr='pop_size',
                        decorate_original_tree=False):
    """
    Given a population tree, `pop_tree` this will return a *pair of
    trees*: a gene tree simulated on this population tree based on
    Kingman's n-coalescent, and population tree with the additional
    attribute 'gene_nodes' on each node, which is a list of
    uncoalesced nodes from the gene tree associated with the given
    node from the population tree.

    `pop_tree` should be a DendroPy Tree object or an object
    of a class derived from this. Each of its leaf nodes must have the
    attribute named by `num_genes_attr` (by default `num_genes`) -- the
    number of gene samples from that population in the present. Each edge
    on the tree may also have the attribute named by `pop_size_attr`.

    `pop_size_attr` is the attribute name of the edges of `pop_tree` that
    specify the population size. By default it is `pop_size`. This should
    specify the effective *haploid* population size; i.e., number of genes
    in the population: 2 * N in a diploid population of N individuals,
    or N in a haploid population of N individuals. An edge without this
    attribute is given a population size of 1.

    If `pop_size` is 1 or 0 or None, then the edge lengths of `pop_tree` are
    taken to be in haploid population units; i.e. where 1 unit equals 2N
    generations for a diploid population of size N, or N generations for a
    haploid population of size N. Otherwise the edge lengths of `pop_tree`
    are taken to be in generations.

    If `gene_tree_list` is given, then the gene tree is added to the
    tree block, and the tree block's taxa block will be used to manage
    the gene tree's `taxa`.

    `rng` is the random number generator to use; `GLOBAL_RNG` is used when
    it is None.

    `gene_node_label_func` is a function that takes two arguments (a string
    and an integer, respectively, where the string is the containing species
    taxon label and the integer is the gene index) and returns a label for
    the corresponding gene node. By default labels have the form
    "<species label>_<two-digit gene index>", with indices starting at 1.

    If `decorate_original_tree` is True, then the list of uncoalesced nodes at
    each node of the population tree is added to the original (input)
    population tree instead of a copy.

    Returns the tuple `(gene_tree, population_tree)`.

    Note that this function does very much the same thing as
    `contained_coalescent()`, but provides a very different API.
    """

    # get our random number generator
    if rng is None:
        rng = GLOBAL_RNG  # use the global rng by default

    if gene_tree_list is not None:
        gtaxa = gene_tree_list.taxon_set
    else:
        gtaxa = dataobject.TaxonSet()

    if gene_node_label_func is None:
        gene_node_label_func = lambda x, y: "%s_%02d" % (x, y)

    # we create a set of gene nodes for each leaf node on the population
    # tree, and associate those gene nodes to the leaf by assignment
    # of 'taxon'.
    # NOTE(review): this decorates the leaves of the *input* tree even when
    # `decorate_original_tree` is False, because the copy is taken further
    # down -- confirm whether that is intended.
    for leaf in pop_tree.leaf_iter():
        gene_nodes = []
        for gene_count in range(getattr(leaf, num_genes_attr)):
            gene_node = dataobject.Node()
            gene_node.taxon = gtaxa.require_taxon(
                label=gene_node_label_func(leaf.taxon.label, gene_count + 1))
            gene_nodes.append(gene_node)
        leaf.gene_nodes = gene_nodes

    # We iterate through the edges of the population tree in post-order,
    # i.e., visiting child edges before we visit parent edges. For
    # each edge visited, we take the genes found in the child nodes,
    # and run the coalescent simulation on them, bounded by the length
    # of the edge. Any genes that have not yet coalesced at the end of
    # this period are added to the genes of the tail (parent) node of
    # the edge.

    if decorate_original_tree:
        working_poptree = pop_tree
    else:
        # start with a new (deep) copy of the population tree so as to not
        # to change the original tree
        working_poptree = copy.deepcopy(pop_tree)

    # start with a new tree
    gene_tree = dataobject.Tree()
    gene_tree.taxon_set = gtaxa
    for edge in working_poptree.postorder_edge_iter():

        # Resolve the population size for *this* edge before branching.
        # Previously it was only assigned in the non-root branch, so the
        # root reused whatever the last-visited edge had set (or raised
        # NameError on a single-node tree). A missing attribute means all
        # our time will be in population units.
        pop_size = getattr(edge, pop_size_attr, 1)

        # if mrca root, run unconstrained coalescent
        if edge.head_node.parent_node is None:
            if len(edge.head_node.gene_nodes) > 1:
                final = coalescent.coalesce(nodes=edge.head_node.gene_nodes,
                                            pop_size=pop_size,
                                            period=None,
                                            rng=rng)
            else:
                final = edge.head_node.gene_nodes
            gene_tree.seed_node = final[0]
        else:
            uncoal = coalescent.coalesce(nodes=edge.head_node.gene_nodes,
                                         pop_size=pop_size,
                                         period=edge.length,
                                         rng=rng)
            # Lineages that survive this edge are handed up to the parent.
            if not hasattr(edge.tail_node, 'gene_nodes'):
                edge.tail_node.gene_nodes = []
            edge.tail_node.gene_nodes.extend(uncoal)

    gene_tree.is_rooted = True
    if gene_tree_list is not None:
        gene_tree_list.append(gene_tree)
    return gene_tree, working_poptree
Ejemplo n.º 5
0
def contained_coalescent(containing_tree,
                         gene_to_containing_taxon_map,
                         edge_pop_size_attr="pop_size",
                         default_pop_size=1,
                         rng=None):
    """
    Returns a gene tree simulated under the coalescent contained within a
    population or species tree.

        `containing_tree`
            The population or species tree. If `edge_pop_size_attr` is not
            None, and population sizes given are non-trivial (i.e., >1), then
            edge lengths on this tree are in units of generations. Otherwise
            edge lengths are in population units; i.e. 2N generations for
            diploid populations of size N, or N generations for haploid
            populations of size N.

        `gene_to_containing_taxon_map`
            A TaxonSetMapping object mapping Taxon objects in the
            `containing_tree` TaxonSet to corresponding Taxon objects in the
            resulting gene tree. Its `domain_taxon_set` and `reverse`
            attributes are read here.

        `edge_pop_size_attr`
            Name of attribute of edges that specify population size. By default
            this is "pop_size". If this attribute does not exist,
            `default_pop_size` will be used.  The value for this attribute
            should be the haploid population size or the number of genes;
            i.e.  2N for a diploid population of N individuals, or N for a
            haploid population of N individuals. This value determines how
            branch length units are interpreted in the input tree,
            `containing_tree`.  If a biologically-meaningful value, then branch
            lengths on the `containing_tree` are properly read as generations.
            If not (e.g. 1 or 0), then they are in population units, i.e. where
            1 unit of time equals 2N generations for a diploid population of
            size N, or N generations for a haploid population of size N.
            If this argument is None, then population sizes default to
            `default_pop_size`.

        `default_pop_size`
            Population size to use if `edge_pop_size_attr` is None or
            if an edge does not have the attribute. Defaults to 1.

        `rng`
            Random number generator to use; `GLOBAL_RNG` is used when None.

    The returned gene tree will have the following extra attributes:

        `pop_node_genes`
            A dictionary with nodes of `containing_tree` as keys and a list of gene
            tree nodes that are uncoalesced as values.

    Note that this function does very much the same thing as
    `constrained_kingman()`, but provides a very different API.
    """

    if rng is None:
        rng = GLOBAL_RNG

    gene_tree_taxon_set = gene_to_containing_taxon_map.domain_taxon_set
    # Containing-tree taxon -> collection of gene taxa.
    pop_gene_taxa = gene_to_containing_taxon_map.reverse
    if gene_tree_taxon_set is None:
        # The mapping has no taxon set of its own: assemble one from every
        # gene taxon in the reverse mapping. (This branch used to iterate an
        # undefined name and raised NameError.)
        # NOTE(review): assumes iterating `reverse` yields its keys, as a
        # dict does -- confirm against TaxonSetMapping.
        gene_tree_taxon_set = dataobject.TaxonSet()
        for pop_taxon in pop_gene_taxa:
            for taxon in pop_gene_taxa[pop_taxon]:
                gene_tree_taxon_set.add(taxon)
    gene_tree = dataobject.Tree(taxon_set=gene_tree_taxon_set)
    gene_tree.is_rooted = True

    # Seed every mapped node of the containing tree with one fresh gene node
    # per gene taxon assigned to it.
    pop_node_genes = {}
    for nd in containing_tree.postorder_node_iter():
        if nd.taxon and nd.taxon in pop_gene_taxa:
            pop_node_genes[nd] = []
            gene_taxa = pop_gene_taxa[nd.taxon]
            for gene_taxon in gene_taxa:
                gene_node = dataobject.Node()
                gene_node.taxon = gene_taxon
                pop_node_genes[nd].append(gene_node)

    # Post-order: child edges are processed before their parent edge, so a
    # node's gene list is complete by the time its own edge is visited.
    for edge in containing_tree.postorder_edge_iter():

        if edge_pop_size_attr and hasattr(edge, edge_pop_size_attr):
            pop_size = getattr(edge, edge_pop_size_attr)
        else:
            pop_size = default_pop_size
        if edge.head_node.parent_node is None:
            # Root: coalesce without a time limit.
            # NOTE(review): the root run uses `default_pop_size`, not the
            # root edge's own `pop_size` computed just above -- confirm
            # that this is deliberate.
            if len(pop_node_genes[edge.head_node]) > 1:
                final = coalescent.coalesce(
                    nodes=pop_node_genes[edge.head_node],
                    pop_size=default_pop_size,
                    period=None,
                    rng=rng)
            else:
                final = pop_node_genes[edge.head_node]
            gene_tree.seed_node = final[0]
        else:
            # Coalesce for the duration of this edge, then hand the
            # surviving lineages up to the parent node.
            uncoal = coalescent.coalesce(nodes=pop_node_genes[edge.head_node],
                                         pop_size=pop_size,
                                         period=edge.length,
                                         rng=rng)
            if edge.tail_node not in pop_node_genes:
                pop_node_genes[edge.tail_node] = []
            pop_node_genes[edge.tail_node].extend(uncoal)

    gene_tree.pop_node_genes = pop_node_genes
    return gene_tree
Ejemplo n.º 6
0
def tree_from_token_stream(stream_tokenizer, **kwargs):
    """
    Processes a (SINGLE) TREE statement. Assumes that the input stream is
    located at the beginning of the statement (i.e., the first non-comment
    token should be the opening parenthesis of the tree definition).

    Returns None if the tokenizer yields no token at all.

    Keyword arguments read here (defaults in parentheses):

        translate_dict (None)
            Passed to `StrToTaxon` when no `str_to_taxon` is given.
        encode_splits (False)
            If true, split bitmasks are accumulated on edges while parsing
            and recorded in `tree.split_edges`; requires a non-empty
            `taxon_set`.
        rooting_interpreter (RootingInterpreter(**kwargs))
            Decides `tree.is_rooted` from the tokenizer's rooting comment.
        finish_node_func (None)
            Called as `finish_node_func(node, tree)` each time a node is
            closed by a ',' or ')'.
        edge_len_type (float)
            Read here; not used in the part of the body visible below.
        taxon_set (None)
            TaxonSet for the tree; a new `dataobject.TaxonSet` if None.
        suppress_internal_node_taxa (False)
            If true, labels on internal nodes are not resolved to taxa.
        store_tree_weights (False)
            If true, a tree weight comment is parsed into `tree.weight`.
        extract_comment_metadata (False)
            Copied onto the tokenizer; if true, comment metadata is stored
            on the tree and on nodes as `comment_metadata`.
        case_sensitive_taxon_labels (False), allow_repeated_use (False)
            Passed to `StrToTaxon` when no `str_to_taxon` is given.
        str_to_taxon (None)
            If used, must supply the StrToTaxon interface.

    Raises the tokenizer's data format error on malformed input, and
    Exception if `encode_splits` is requested with an empty taxon set.
    """
    translate_dict = kwargs.get("translate_dict", None)
    encode_splits = kwargs.get("encode_splits", False)
    # Note: the default RootingInterpreter(**kwargs) is constructed on every
    # call, even when a 'rooting_interpreter' is supplied.
    rooting_interpreter = kwargs.get("rooting_interpreter",
                                     RootingInterpreter(**kwargs))
    finish_node_func = kwargs.get("finish_node_func", None)
    edge_len_type = kwargs.get("edge_len_type", float)
    taxon_set = kwargs.get("taxon_set", None)
    suppress_internal_node_taxa = kwargs.get("suppress_internal_node_taxa",
                                             False)
    store_tree_weights = kwargs.get("store_tree_weights", False)
    extract_comment_metadata = kwargs.get('extract_comment_metadata', False)
    case_sensitive_taxon_labels = kwargs.get('case_sensitive_taxon_labels',
                                             False)
    allow_repeated_use = kwargs.get('allow_repeated_use', False)
    # Remember the tokenizer's previous setting before overriding it.
    # NOTE(review): it is not restored anywhere in the portion visible here.
    stream_tokenizer_extract_comment_metadata_setting = stream_tokenizer.extract_comment_metadata
    stream_tokenizer.extract_comment_metadata = extract_comment_metadata
    if taxon_set is None:
        taxon_set = dataobject.TaxonSet()
    tree = dataobject.Tree(taxon_set=taxon_set)

    stream_tokenizer.tree_rooting_comment = None  # clear previous comment
    stream_tokenizer.clear_comment_metadata()
    token = stream_tokenizer.read_next_token()
    if not token:
        return None
    # Reading the first token is what lets the tokenizer pick up any rooting
    # comment that precedes the tree, so rooting is decided only now.
    tree.is_rooted = rooting_interpreter.interpret_as_rooted(
        stream_tokenizer.tree_rooting_comment)
    #    if stream_tokenizer.tree_rooting_comment is not None:
    #        tree.is_rooted = rooting_interpreter.interpret_as_rooted(stream_tokenizer.tree_rooting_comment)
    #    elif rooting_interpreter.interpret_as_rooted(stream_tokenizer.tree_rooting_comment):
    #        tree_is_rooted = True

    if store_tree_weights and stream_tokenizer.tree_weight_comment is not None:
        try:
            # The weight expression is the second space-separated field of
            # the comment; each '/'-separated piece is wrapped in float(...)
            # so that e.g. "1/3" is evaluated as a floating-point division.
            # NOTE(review): eval() runs on text taken from the input stream
            # and is not otherwise sanitised -- unsafe for untrusted input.
            weight_expression = stream_tokenizer.tree_weight_comment.split(
                ' ')[1]
            tree.weight = eval("/".join(
                ["float(%s)" % cv for cv in weight_expression.split('/')]))
        except IndexError:
            # No second field in the comment: leave the weight unset.
            pass
        except ValueError:
            # A piece was not convertible to float: leave the weight unset.
            pass
        stream_tokenizer.tree_weight_comment = None

    if encode_splits:
        if len(taxon_set) == 0:
            raise Exception("When encoding splits on a tree as it is being parsed, a "
                + "fully pre-populated TaxonSet object must be specified using the 'taxon_set' keyword " \
                + "to avoid taxon/split bitmask values changing as new Taxon objects are created " \
                + "and added to the TaxonSet.")
        if tree.is_rooted:
            tree.split_edges = {}
        else:
            # Unrooted trees use a dictionary built with the all-taxa mask.
            atb = taxon_set.all_taxa_bitmask()
            d = containers.NormalizedBitmaskDict(mask=atb)
            tree.split_edges = d
        split_map = tree.split_edges

    # Label-to-taxon resolver: caller-supplied, or built from the taxon set.
    stt = kwargs.get('str_to_taxon')
    if stt is None:
        stt = StrToTaxon(taxon_set,
                         translate_dict,
                         allow_repeated_use=allow_repeated_use,
                         case_sensitive=case_sensitive_taxon_labels)

    tree.seed_node = dataobject.Node()
    # `curr_node` is the node currently being described by the token stream.
    curr_node = tree.seed_node
    if encode_splits:
        curr_node.edge.split_bitmask = 0L

    ### NHX format support ###
    def store_node_comments(active_node):
        # Append any comments the tokenizer is holding to the node.
        if stream_tokenizer.comments:
            active_node.comments.extend(stream_tokenizer.comments)

    def store_comment_metadata(target):
        # Move pending comment metadata from the tokenizer onto `target`,
        # or make sure `target` has an (empty) `comment_metadata` dict.
        if extract_comment_metadata:
            if stream_tokenizer.has_comment_metadata():
                comment_metadata = stream_tokenizer.comment_metadata
                try:
                    target.comment_metadata.update(comment_metadata)
                except AttributeError:
                    # Target has no `comment_metadata` yet: adopt this one.
                    target.comment_metadata = comment_metadata
                stream_tokenizer.clear_comment_metadata()
            elif not hasattr(target, "comment_metadata"):
                target.comment_metadata = {}

    # store and clear comments
    tree.comments = stream_tokenizer.comments
    stream_tokenizer.clear_comments()
    store_comment_metadata(tree)

    # Main parse loop: dispatch on the current token.
    while True:
        if not token or token == ';':
            # End of statement: we must be back at the root.
            if curr_node is not tree.seed_node:
                raise stream_tokenizer.data_format_error(
                    "Unbalanced parentheses -- not enough ')' characters found in tree description"
                )
            if encode_splits:
                split_map[curr_node.edge.split_bitmask] = curr_node.edge
            break
        if token == '(':
            # Open a new clade: descend into a fresh first child.
            if not curr_node.parent_node:
                if curr_node.child_nodes():
                    # The root already has children, so the tree description
                    # was already closed.
                    raise stream_tokenizer.data_format_error(
                        "Unexpected '(' after the tree description.  Expecting a label for the root or a ;"
                    )
            tmp_node = dataobject.Node()
            if encode_splits:
                tmp_node.edge.split_bitmask = 0L
            curr_node.add_child(tmp_node)
            curr_node = tmp_node
            token = stream_tokenizer.read_next_token()
            store_node_comments(curr_node)
            store_comment_metadata(curr_node)
        elif token == ',':
            # Close the current node and start a new sibling.
            tmp_node = dataobject.Node()
            if curr_node.is_leaf() and not curr_node.taxon:
                #                 curr_node.taxon = taxon_set.Taxon(oid="UNAMED_" + str(id(curr_node)), label='')
                #                 taxon_set.add(curr_node.taxon)
                raise stream_tokenizer.data_format_error(
                    "Missing taxon specifier in a tree -- found either a '(,' or ',,' construct."
                )
            p = curr_node.parent_node
            if not p:
                raise stream_tokenizer.data_format_error(
                    "Comma found one the 'outside' of a newick tree description"
                )
            if encode_splits:
                tmp_node.edge.split_bitmask = 0L
                # Record the finished node's split and fold it into the
                # parent's bitmask.
                e = curr_node.edge
                u = e.split_bitmask
                split_map[u] = e
                p.edge.split_bitmask |= u
            if finish_node_func is not None:
                finish_node_func(curr_node, tree)
            p.add_child(tmp_node)
            curr_node = tmp_node
            token = stream_tokenizer.read_next_token()
            store_node_comments(curr_node)
            store_comment_metadata(curr_node)
        else:
            if token == ')':
                # Close the current node and climb back to its parent.
                if curr_node.is_leaf() and not curr_node.taxon:
                    raise stream_tokenizer.data_format_error(
                        "Missing taxon specifier in a tree -- found either a '(,' or ',,' construct."
                    )
                p = curr_node.parent_node
                if not p:
                    raise stream_tokenizer.data_format_error(
                        "Unbalanced parentheses -- too many ')' characters found in tree description"
                    )
                if encode_splits:
                    e = curr_node.edge
                    u = e.split_bitmask
                    p.edge.split_bitmask |= u
                    split_map[u] = curr_node.edge
                if finish_node_func is not None:
                    finish_node_func(curr_node, tree)
                curr_node = p
            else:
                # Any other token is treated as a label for the current node.
                is_leaf = curr_node.is_leaf()
                if is_leaf:
                    if curr_node.taxon:
                        raise stream_tokenizer.data_format_error(
                            "Multiple labels found for the same leaf (taxon '%s' and label '%s')"
                            % (str(curr_node.taxon), token))
                    try:
                        # NOTE(review): `stt_require_taxon` is not defined in
                        # this function; presumably a module-level helper.
                        t = stt_require_taxon(stt, label=token)
                    except StrToTaxon.MultipleTaxonUseError, e:
                        raise stream_tokenizer.data_format_error(e.msg)
                else:
                    if curr_node.label:
                        raise stream_tokenizer.data_format_error(
                            "Multiple labels found for the same leaf (taxon '%s' and label '%s')"
                            % (curr_node.label, token))
                    if suppress_internal_node_taxa:
                        t = None
                    else:
                        try:
                            t = stt.get_taxon(label=token)
                        except StrToTaxon.MultipleTaxonUseError, e:
                            raise stream_tokenizer.data_format_error(e.msg)
                # NOTE(review): the body visible in this file stops here; the
                # code that uses `t` and advances `token` is not shown.
Ejemplo n.º 7
0
    def simulate_contained_kingman(self,
                                   edge_pop_size_attr='pop_size',
                                   default_pop_size=1,
                                   label=None,
                                   rng=None):
        """
        Simulates and returns a "censored" (Kingman) neutral coalescence tree
        conditional on self.

            ``edge_pop_size_attr``
                Name of the attribute of self's edges that gives the
                population size. An edge lacking this attribute uses
                ``default_pop_size``.

            ``default_pop_size``
                Population size for edges without the attribute above.
                Defaults to 1.

            ``label``
                Label given to the returned tree.

            ``rng``
                Random number generator, passed unchanged (including
                ``None``) to ``coalescent.coalesce``.

        Note that all edge-associated taxon sets must be up-to-date (otherwise,
        ``build_edge_taxa_sets()`` should be called), and that the tree
        is *not* added to the set of contained trees. For the latter, call
        ``embed_contained_kingman``.
        """

        # For each node of the containing tree, the gene-tree lineages still
        # uncoalesced at that node. Leaves start with one gene node per taxon
        # contained in their edge.
        contained_nodes = {}
        for leaf in self.leaf_iter():
            contained_nodes[leaf] = [dataobject.Node(taxon=gene_taxon)
                                     for gene_taxon in leaf.edge.contained_taxa]

        # Walk the edges child-first, coalescing lineages along each one.
        for edge in self.postorder_edge_iter():
            head = edge.head_node
            at_root = head.parent_node is None

            if hasattr(edge, edge_pop_size_attr):
                pop_size = getattr(edge, edge_pop_size_attr)
            else:
                pop_size = default_pop_size

            lineages = contained_nodes[head]
            if not at_root:
                # Coalesce only for the duration of this edge; survivors are
                # passed up to the parent node.
                remaining = coalescent.coalesce(nodes=lineages,
                                                pop_size=pop_size,
                                                period=edge.length,
                                                rng=rng)
                tail = edge.tail_node
                if tail in contained_nodes:
                    contained_nodes[tail].extend(remaining)
                else:
                    contained_nodes[tail] = remaining
            elif len(lineages) > 1:
                # Root: no time limit on the coalescent run.
                final = coalescent.coalesce(nodes=lineages,
                                            pop_size=pop_size,
                                            period=None,
                                            rng=rng)
            else:
                final = lineages

        # Wrap the surviving root lineage in a rooted tree.
        contained_tree = dataobject.Tree(taxon_set=self.contained_taxon_set,
                                         label=label)
        contained_tree.seed_node = final[0]
        contained_tree.is_rooted = True
        return contained_tree