def monophyletic_partition_discordance(tree, taxon_set_partition):
    """
    Returns the number of deep coalescences on tree `tree` that would result
    if the taxa in each subset of `taxon_set_partition` formed K
    mutually-exclusive monophyletic groups, where K is the number of subsets.

    `tree`
        The tree to score. It is only read (via `postorder_node_iter()`,
        `child_nodes()` and each leaf's `taxon`); it is not modified.
    `taxon_set_partition`
        An object (e.g. a TaxonSetPartition) whose `subsets()` method returns
        the groups of taxa.

    A reduced "group tree" is built in which every leaf of `tree` is
    relabelled with a taxon standing for the subset containing it, and
    sibling lineages carrying the same group label are merged. The value
    returned is the number of leaves of that reduced tree minus K.

    Raises ValueError if a leaf of `tree` carries a taxon that is not found
    in any subset of the partition.
    """
    tax_sets = taxon_set_partition.subsets()

    # One placeholder taxon per subset, labelled by the subset's index.
    dc_tree = dataobject.Tree()
    dc_tree.taxon_set = dataobject.TaxonSet()
    for idx, _ in enumerate(tax_sets):
        dc_tree.taxon_set.append(dataobject.Taxon(label=str(idx)))

    def _get_dc_taxon(nd):
        # Map a leaf of the source tree to the placeholder taxon of the
        # subset that contains the leaf's taxon.
        for idx, tax_set in enumerate(tax_sets):
            if nd.taxon in tax_set:
                return dc_tree.taxon_set[idx]
        # The original code used `assert "<message>"`, which is always true
        # and therefore never reported the problem.
        raise ValueError("taxon not found in partition: '%s'" % nd.taxon.label)

    # Maps nodes of the source tree to their counterparts in the reduced
    # tree; entries are dropped once a child has been folded into its parent.
    src_dc_map = {}
    for snd in tree.postorder_node_iter():
        nnd = dataobject.Node()
        src_dc_map[snd] = nnd
        children = snd.child_nodes()
        if len(children) == 0:
            nnd.taxon = _get_dc_taxon(snd)
        else:
            # Distinct group taxa contributed by children that are
            # themselves (reduced) leaves; kept as a list to preserve order.
            taxa_set = []
            for cnd in children:
                dc_node = src_dc_map[cnd]
                if len(dc_node.child_nodes()) > 1:
                    # Child already spans several groups: keep it as a subtree.
                    nnd.add_child(dc_node)
                else:
                    ctax = dc_node.taxon
                    if ctax is not None and ctax not in taxa_set:
                        taxa_set.append(ctax)
                    del src_dc_map[cnd]
            if len(taxa_set) > 1:
                for t in taxa_set:
                    cnd = dataobject.Node()
                    cnd.taxon = t
                    nnd.add_child(cnd)
            else:
                if len(nnd.child_nodes()) == 0:
                    # All children belong to one group: collapse to a leaf.
                    nnd.taxon = taxa_set[0]
                elif len(taxa_set) == 1:
                    cnd = dataobject.Node()
                    cnd.taxon = taxa_set[0]
                    nnd.add_child(cnd)

    # Post-order traversal ends at the root, so `nnd` is its counterpart.
    dc_tree.seed_node = nnd
    return len(dc_tree.leaf_nodes()) - len(tax_sets)
def pure_kingman(taxon_set, pop_size=1, rng=None):
    """
    Simulates a tree under the unconstrained Kingman coalescent.

    `taxon_set`
        Iterable of taxa; one leaf node is created per taxon, and the
        returned tree is constructed with this object as its taxon set.
    `pop_size`
        Population size handed to `coalescent.coalesce()`.
    `rng`
        Random number generator; `GLOBAL_RNG` is used when None.

    Returns a new Tree whose seed node is the first node returned by the
    coalescent run (called with `period=None`).
    """
    if rng is None:
        rng = GLOBAL_RNG
    leaf_nodes = []
    for taxon in taxon_set:
        leaf_nodes.append(dataobject.Node(taxon=taxon))
    coalesced = coalescent.coalesce(nodes=leaf_nodes,
                                    pop_size=pop_size,
                                    period=None,
                                    rng=rng)
    return dataobject.Tree(taxon_set=taxon_set, seed_node=coalesced[0])
def mean_kingman(taxon_set, pop_size=1, rng=None):
    """
    Returns a tree with coalescent intervals given by the expected times
    under Kingman's neutral coalescent.

    `taxon_set`
        Iterable of taxa; one leaf node is created per taxon, and the
        returned tree is constructed with this object as its taxon set.
    `pop_size`
        Population size handed to `coalescent.coalesce()`.
    `rng`
        Random number generator forwarded to `coalescent.coalesce()`;
        `GLOBAL_RNG` is used when None. (Previously the body referred to
        `rng` without it being a parameter, so every call failed.)
    """
    if rng is None:
        rng = GLOBAL_RNG  # use the global rng by default
    nodes = [dataobject.Node(taxon=t) for t in taxon_set]
    seed_node = coalescent.coalesce(nodes=nodes,
                                    pop_size=pop_size,
                                    period=None,
                                    rng=rng,
                                    use_expected_tmrca=True)[0]
    tree = dataobject.Tree(taxon_set=taxon_set, seed_node=seed_node)
    return tree
def constrained_kingman(pop_tree,
                        gene_tree_list=None,
                        rng=None,
                        gene_node_label_func=None,
                        num_genes_attr='num_genes',
                        pop_size_attr='pop_size',
                        decorate_original_tree=False):
    """
    Given a population tree, `pop_tree`, this will return a *pair of trees*:
    a gene tree simulated on this population tree based on Kingman's
    n-coalescent, and a population tree with the additional attribute
    `gene_nodes` on each node, which is a list of uncoalesced nodes from the
    gene tree associated with the given node from the population tree.

    `pop_tree`
        The population tree. Each leaf node must have the attribute named by
        `num_genes_attr`, and edges may have the attribute named by
        `pop_size_attr`.
    `gene_tree_list`
        If given, the gene tree is appended to this list, and the list's
        `taxon_set` is used to manage the gene tree's taxa. Otherwise a new
        TaxonSet is created.
    `rng`
        Random number generator; `GLOBAL_RNG` is used when None.
    `gene_node_label_func`
        A function that takes two arguments (a string and an integer,
        respectively, where the string is the containing species taxon label
        and the integer is the 1-based gene index) and returns a label for
        the corresponding gene node. Defaults to "<label>_<NN>".
    `num_genes_attr`
        Name of the leaf-node attribute giving the number of gene samples
        from each population in the present. By default it is `num_genes`.
    `pop_size_attr`
        Name of the attribute of the edges of `pop_tree` that specifies the
        population size. By default it is `pop_size`. This should specify
        the effective *haploid* population size; i.e., number of genes in
        the population: 2 * N in a diploid population of N individuals, or N
        in a haploid population of N individuals. If the population size is
        1 or 0 or None, then the edge lengths of `pop_tree` are taken to be
        in haploid population units; i.e. where 1 unit equals 2N generations
        for a diploid population of size N, or N generations for a haploid
        population of size N. Otherwise the edge lengths of `pop_tree` are
        taken to be in generations. Non-root edges without the attribute use
        a population size of 1.
    `decorate_original_tree`
        If True, the list of uncoalesced nodes at each node of the
        population tree is added to the original (input) population tree
        instead of a deep copy.

    Note that this function does very much the same thing as
    `contained_coalescent()`, but provides a very different API.
    """
    # get our random number generator
    if rng is None:
        rng = GLOBAL_RNG  # use the global rng by default

    if gene_tree_list is not None:
        gtaxa = gene_tree_list.taxon_set
    else:
        gtaxa = dataobject.TaxonSet()

    if gene_node_label_func is None:
        gene_node_label_func = lambda x, y: "%s_%02d" % (x, y)

    # We create a set of gene nodes for each leaf node on the population
    # tree, and associate those gene nodes with the leaf through the
    # `gene_nodes` attribute.
    # NOTE: this runs on `pop_tree` itself, before any copy is made, so the
    # leaves of the input tree gain a `gene_nodes` attribute even when
    # `decorate_original_tree` is False.
    for leaf in pop_tree.leaf_iter():
        gene_nodes = []
        for gene_count in range(getattr(leaf, num_genes_attr)):
            gene_node = dataobject.Node()
            gene_node.taxon = gtaxa.require_taxon(
                label=gene_node_label_func(leaf.taxon.label, gene_count + 1))
            gene_nodes.append(gene_node)
        leaf.gene_nodes = gene_nodes

    # We iterate through the edges of the population tree in post-order,
    # i.e., visiting child edges before we visit parent edges. For
    # each edge visited, we take the genes found in the child nodes,
    # and run the coalescent simulation on them, bounded by the length
    # of the edge. Any genes that have not yet coalesced at the end of
    # this period are added to the genes of the tail (parent) node of
    # the edge.
    if decorate_original_tree:
        working_poptree = pop_tree
    else:
        # start with a new (deep) copy of the population tree so as not
        # to change the original tree
        working_poptree = copy.deepcopy(pop_tree)

    # start with a new tree
    gene_tree = dataobject.Tree()
    gene_tree.taxon_set = gtaxa

    # Initialised so that a population tree consisting of a single node
    # (whose only edge is the root edge) no longer fails with a NameError.
    pop_size = 1
    for edge in working_poptree.postorder_edge_iter():
        if edge.head_node.parent_node is None:
            # MRCA/root: run the unconstrained coalescent. Use the root
            # edge's own population size when it has one; otherwise keep the
            # value carried over from the previously visited edge, which is
            # what the original code (implicitly) used.
            pop_size = getattr(edge, pop_size_attr, pop_size)
            if len(edge.head_node.gene_nodes) > 1:
                final = coalescent.coalesce(nodes=edge.head_node.gene_nodes,
                                            pop_size=pop_size,
                                            period=None,
                                            rng=rng)
            else:
                final = edge.head_node.gene_nodes
            gene_tree.seed_node = final[0]
        else:
            # A missing attribute means all our time is in population units.
            pop_size = getattr(edge, pop_size_attr, 1)
            uncoal = coalescent.coalesce(nodes=edge.head_node.gene_nodes,
                                         pop_size=pop_size,
                                         period=edge.length,
                                         rng=rng)
            if not hasattr(edge.tail_node, 'gene_nodes'):
                edge.tail_node.gene_nodes = []
            edge.tail_node.gene_nodes.extend(uncoal)

    gene_tree.is_rooted = True
    if gene_tree_list is not None:
        gene_tree_list.append(gene_tree)
    return gene_tree, working_poptree
def contained_coalescent(containing_tree,
                         gene_to_containing_taxon_map,
                         edge_pop_size_attr="pop_size",
                         default_pop_size=1,
                         rng=None):
    """
    Returns a gene tree simulated under the coalescent contained within a
    population or species tree.

    `containing_tree`
        The population or species tree. If `edge_pop_size_attr` is not None,
        and population sizes given are non-trivial (i.e., >1), then edge
        lengths on this tree are in units of generations. Otherwise edge
        lengths are in population units; i.e. 2N generations for diploid
        populations of size N, or N generations for haploid populations of
        size N.

    `gene_to_containing_taxon_map`
        A mapping object (e.g. a TaxonSetMapping) relating gene Taxon
        objects to Taxon objects of `containing_tree`. Its
        `domain_taxon_set` is used as the taxon set of the gene tree, and
        its `reverse` attribute is indexed by taxa of `containing_tree` to
        obtain the gene taxa sampled from each of them.

    `edge_pop_size_attr`
        Name of attribute of edges that specify population size. By default
        this is "pop_size". If this attribute does not exist,
        `default_pop_size` will be used. The value for this attribute
        should be the haploid population size or the number of genes;
        i.e. 2N for a diploid population of N individuals, or N for a
        haploid population of N individuals. This value determines how
        branch length units are interpreted in the input tree,
        `containing_tree`. If a biologically-meaningful value, then branch
        lengths on the `containing_tree` are properly read as generations.
        If not (e.g. 1 or 0), then they are in population units, i.e. where
        1 unit of time equals 2N generations for a diploid population of
        size N, or N generations for a haploid population of size N.
        If this argument is None, then population sizes default to
        `default_pop_size`.

    `default_pop_size`
        Population size to use if `edge_pop_size_attr` is None or if an edge
        does not have the attribute. Defaults to 1. It is also the
        population size always used for the root edge.

    `rng`
        Random number generator; `GLOBAL_RNG` is used when None.

    The returned gene tree will have the following extra attributes:

        `pop_node_genes`
            A dictionary with nodes of `containing_tree` as keys and a list
            of gene tree nodes that are uncoalesced as values.

    Note that this function does very much the same thing as
    `constrained_kingman()`, but provides a very different API.
    """
    if rng is None:
        rng = GLOBAL_RNG

    # Containing-tree taxon -> gene taxa sampled from it.
    pop_gene_taxa = gene_to_containing_taxon_map.reverse

    gene_tree_taxon_set = gene_to_containing_taxon_map.domain_taxon_set
    if gene_tree_taxon_set is None:
        # No taxon set on the mapping: build one from the mapped gene taxa.
        # (The original referenced an undefined name here and so raised a
        # NameError whenever this path was taken.)
        # Assumes iterating `reverse` yields its keys, as a dict does --
        # TODO confirm against the mapping class.
        gene_tree_taxon_set = dataobject.TaxonSet()
        for pop_taxon in pop_gene_taxa:
            for taxon in pop_gene_taxa[pop_taxon]:
                gene_tree_taxon_set.add(taxon)

    gene_tree = dataobject.Tree(taxon_set=gene_tree_taxon_set)
    gene_tree.is_rooted = True

    # Seed every containing-tree node that has sampled genes with one fresh
    # gene-tree node per gene taxon.
    pop_node_genes = {}
    for nd in containing_tree.postorder_node_iter():
        if nd.taxon and nd.taxon in pop_gene_taxa:
            pop_node_genes[nd] = []
            gene_taxa = pop_gene_taxa[nd.taxon]
            for gene_taxon in gene_taxa:
                gene_node = dataobject.Node()
                gene_node.taxon = gene_taxon
                pop_node_genes[nd].append(gene_node)

    # Children are visited before parents, so by the time an edge is
    # processed all lineages entering it have been collected at its head.
    for edge in containing_tree.postorder_edge_iter():

        if edge_pop_size_attr and hasattr(edge, edge_pop_size_attr):
            pop_size = getattr(edge, edge_pop_size_attr)
        else:
            pop_size = default_pop_size

        if edge.head_node.parent_node is None:
            # Root: coalesce without a time limit.
            # NOTE(review): the root uses `default_pop_size` rather than the
            # root edge's own attribute; kept as in the original -- confirm
            # whether this is intended.
            if len(pop_node_genes[edge.head_node]) > 1:
                final = coalescent.coalesce(
                    nodes=pop_node_genes[edge.head_node],
                    pop_size=default_pop_size,
                    period=None,
                    rng=rng)
            else:
                final = pop_node_genes[edge.head_node]
            gene_tree.seed_node = final[0]
        else:
            # Coalesce for the duration of this edge, then hand the
            # remaining lineages to the parent node.
            uncoal = coalescent.coalesce(nodes=pop_node_genes[edge.head_node],
                                         pop_size=pop_size,
                                         period=edge.length,
                                         rng=rng)
            if edge.tail_node not in pop_node_genes:
                pop_node_genes[edge.tail_node] = []
            pop_node_genes[edge.tail_node].extend(uncoal)

    gene_tree.pop_node_genes = pop_node_genes
    return gene_tree
def tree_from_token_stream(stream_tokenizer, **kwargs): """ Processes a (SINGLE) TREE statement. Assumes that the input stream is located at the beginning of the statement (i.e., the first non-comment token should be the opening parenthesis of the tree definition). str_to_taxon kwarg (if used) must supply the StrToTaxon interface). """ translate_dict = kwargs.get("translate_dict", None) encode_splits = kwargs.get("encode_splits", False) rooting_interpreter = kwargs.get("rooting_interpreter", RootingInterpreter(**kwargs)) finish_node_func = kwargs.get("finish_node_func", None) edge_len_type = kwargs.get("edge_len_type", float) taxon_set = kwargs.get("taxon_set", None) suppress_internal_node_taxa = kwargs.get("suppress_internal_node_taxa", False) store_tree_weights = kwargs.get("store_tree_weights", False) extract_comment_metadata = kwargs.get('extract_comment_metadata', False) case_sensitive_taxon_labels = kwargs.get('case_sensitive_taxon_labels', False) allow_repeated_use = kwargs.get('allow_repeated_use', False) stream_tokenizer_extract_comment_metadata_setting = stream_tokenizer.extract_comment_metadata stream_tokenizer.extract_comment_metadata = extract_comment_metadata if taxon_set is None: taxon_set = dataobject.TaxonSet() tree = dataobject.Tree(taxon_set=taxon_set) stream_tokenizer.tree_rooting_comment = None # clear previous comment stream_tokenizer.clear_comment_metadata() token = stream_tokenizer.read_next_token() if not token: return None tree.is_rooted = rooting_interpreter.interpret_as_rooted( stream_tokenizer.tree_rooting_comment) # if stream_tokenizer.tree_rooting_comment is not None: # tree.is_rooted = rooting_interpreter.interpret_as_rooted(stream_tokenizer.tree_rooting_comment) # elif rooting_interpreter.interpret_as_rooted(stream_tokenizer.tree_rooting_comment): # tree_is_rooted = True if store_tree_weights and stream_tokenizer.tree_weight_comment is not None: try: weight_expression = stream_tokenizer.tree_weight_comment.split( ' ')[1] tree.weight 
= eval("/".join( ["float(%s)" % cv for cv in weight_expression.split('/')])) except IndexError: pass except ValueError: pass stream_tokenizer.tree_weight_comment = None if encode_splits: if len(taxon_set) == 0: raise Exception("When encoding splits on a tree as it is being parsed, a " + "fully pre-populated TaxonSet object must be specified using the 'taxon_set' keyword " \ + "to avoid taxon/split bitmask values changing as new Taxon objects are created " \ + "and added to the TaxonSet.") if tree.is_rooted: tree.split_edges = {} else: atb = taxon_set.all_taxa_bitmask() d = containers.NormalizedBitmaskDict(mask=atb) tree.split_edges = d split_map = tree.split_edges stt = kwargs.get('str_to_taxon') if stt is None: stt = StrToTaxon(taxon_set, translate_dict, allow_repeated_use=allow_repeated_use, case_sensitive=case_sensitive_taxon_labels) tree.seed_node = dataobject.Node() curr_node = tree.seed_node if encode_splits: curr_node.edge.split_bitmask = 0L ### NHX format support ### def store_node_comments(active_node): if stream_tokenizer.comments: active_node.comments.extend(stream_tokenizer.comments) def store_comment_metadata(target): if extract_comment_metadata: if stream_tokenizer.has_comment_metadata(): comment_metadata = stream_tokenizer.comment_metadata try: target.comment_metadata.update(comment_metadata) except AttributeError: target.comment_metadata = comment_metadata stream_tokenizer.clear_comment_metadata() elif not hasattr(target, "comment_metadata"): target.comment_metadata = {} # store and clear comments tree.comments = stream_tokenizer.comments stream_tokenizer.clear_comments() store_comment_metadata(tree) while True: if not token or token == ';': if curr_node is not tree.seed_node: raise stream_tokenizer.data_format_error( "Unbalanced parentheses -- not enough ')' characters found in tree description" ) if encode_splits: split_map[curr_node.edge.split_bitmask] = curr_node.edge break if token == '(': if not curr_node.parent_node: if 
curr_node.child_nodes(): raise stream_tokenizer.data_format_error( "Unexpected '(' after the tree description. Expecting a label for the root or a ;" ) tmp_node = dataobject.Node() if encode_splits: tmp_node.edge.split_bitmask = 0L curr_node.add_child(tmp_node) curr_node = tmp_node token = stream_tokenizer.read_next_token() store_node_comments(curr_node) store_comment_metadata(curr_node) elif token == ',': tmp_node = dataobject.Node() if curr_node.is_leaf() and not curr_node.taxon: # curr_node.taxon = taxon_set.Taxon(oid="UNAMED_" + str(id(curr_node)), label='') # taxon_set.add(curr_node.taxon) raise stream_tokenizer.data_format_error( "Missing taxon specifier in a tree -- found either a '(,' or ',,' construct." ) p = curr_node.parent_node if not p: raise stream_tokenizer.data_format_error( "Comma found one the 'outside' of a newick tree description" ) if encode_splits: tmp_node.edge.split_bitmask = 0L e = curr_node.edge u = e.split_bitmask split_map[u] = e p.edge.split_bitmask |= u if finish_node_func is not None: finish_node_func(curr_node, tree) p.add_child(tmp_node) curr_node = tmp_node token = stream_tokenizer.read_next_token() store_node_comments(curr_node) store_comment_metadata(curr_node) else: if token == ')': if curr_node.is_leaf() and not curr_node.taxon: raise stream_tokenizer.data_format_error( "Missing taxon specifier in a tree -- found either a '(,' or ',,' construct." 
) p = curr_node.parent_node if not p: raise stream_tokenizer.data_format_error( "Unbalanced parentheses -- too many ')' characters found in tree description" ) if encode_splits: e = curr_node.edge u = e.split_bitmask p.edge.split_bitmask |= u split_map[u] = curr_node.edge if finish_node_func is not None: finish_node_func(curr_node, tree) curr_node = p else: is_leaf = curr_node.is_leaf() if is_leaf: if curr_node.taxon: raise stream_tokenizer.data_format_error( "Multiple labels found for the same leaf (taxon '%s' and label '%s')" % (str(curr_node.taxon), token)) try: t = stt_require_taxon(stt, label=token) except StrToTaxon.MultipleTaxonUseError, e: raise stream_tokenizer.data_format_error(e.msg) else: if curr_node.label: raise stream_tokenizer.data_format_error( "Multiple labels found for the same leaf (taxon '%s' and label '%s')" % (curr_node.label, token)) if suppress_internal_node_taxa: t = None else: try: t = stt.get_taxon(label=token) except StrToTaxon.MultipleTaxonUseError, e: raise stream_tokenizer.data_format_error(e.msg)
def simulate_contained_kingman(self,
                               edge_pop_size_attr='pop_size',
                               default_pop_size=1,
                               label=None,
                               rng=None):
    """
    Simulates and returns a "censored" (Kingman) neutral coalescence tree
    conditional on self.

    ``edge_pop_size_attr``
        Name of the attribute of self's edges that gives the population
        size. Edges lacking this attribute use ``default_pop_size``.

    ``default_pop_size``
        Population size for edges that do not carry the attribute.

    ``label``
        Label passed to the constructor of the returned tree.

    ``rng``
        Random number generator, passed through unchanged to
        ``coalescent.coalesce()``.

    Note that all edge-associated taxon sets must be up-to-date
    (otherwise, ``build_edge_taxa_sets()`` should be called), and that
    the tree is *not* added to the set of contained trees. For the
    latter, call ``embed_contained_kingman``.
    """
    # Containing-tree node -> gene-tree nodes currently sitting at it.
    # Leaves start with one gene node per taxon contained in their edge.
    genes_at_node = {}
    for leaf in self.leaf_iter():
        genes_at_node[leaf] = [dataobject.Node(taxon=gene_taxon)
                               for gene_taxon in leaf.edge.contained_taxa]

    # Build the gene tree from the tips towards the root.
    for edge in self.postorder_edge_iter():
        head = edge.head_node
        if hasattr(edge, edge_pop_size_attr):
            pop_size = getattr(edge, edge_pop_size_attr)
        else:
            pop_size = default_pop_size

        if head.parent_node is not None:
            # Internal edge: coalesce for the duration of the edge and pass
            # whatever is left to the parent node.
            leftover = coalescent.coalesce(nodes=genes_at_node[head],
                                           pop_size=pop_size,
                                           period=edge.length,
                                           rng=rng)
            tail = edge.tail_node
            if tail in genes_at_node:
                genes_at_node[tail].extend(leftover)
            else:
                genes_at_node[tail] = leftover
            continue

        # Root edge: coalesce without a time limit until one lineage remains.
        root_genes = genes_at_node[head]
        if len(root_genes) > 1:
            final = coalescent.coalesce(nodes=root_genes,
                                        pop_size=pop_size,
                                        period=None,
                                        rng=rng)
        else:
            final = root_genes

    # Wrap the surviving lineage in a rooted tree and return it.
    contained_tree = dataobject.Tree(taxon_set=self.contained_taxon_set,
                                     label=label)
    contained_tree.seed_node = final[0]
    contained_tree.is_rooted = True
    return contained_tree