def select_less_leaves(nodedata, tree):  #keep=1
    """Pick the child whose descendant leaves cover the fewest distinct species.

    Param:
        `nodedata`: sequence of 2-item tuples whose first item is a child
                    node (the second item is ignored here).
        `tree`: object exposing `data` (node -> list of (child, _) pairs)
                and `info` (node -> dict with a 'taxon_name' entry).

    Return: a tuple `(child, index)` where `index` is the position of the
    selected child in `nodedata`. On ties the first minimal child is taken.
    """
    def children_of(tr, n):
        # Children listed in the tree data; nodes absent from `data` have none.
        return [c for c, _ in tr.data.get(n, [])]

    def count_species(child):
        # Number of distinct taxon names among the leaves found from `child`.
        leaves = list(iter_leaves(tree, children_of, queue=[child]))
        return len({tree.info[leaf]['taxon_name'] for leaf in leaves})

    species_counts = [count_species(child) for child, _ in nodedata]

    # min() scans indices in order, so equal counts resolve to the first one.
    best_i = min(range(len(species_counts)), key=species_counts.__getitem__)
    return nodedata[best_i][0], best_i
def detach_toolongbranch(tree, node, maxdist=MAXDIST):
    """Drop from `node` every child branch whose length is >= `maxdist`.

    The tree is edited in place: `tree.data[node]` is replaced by the list of
    (child, dist) pairs with `dist < maxdist` (an empty list if `node` had no
    entry in `tree.data`).

    Param:
        `tree`: object with a `data` mapping node -> list of (child, dist).
        `node`: the node whose outgoing branches are checked.
        `maxdist`: branch length from which a child is detached.

    Return: `(n_detached, n_leaves_detached)`: the number of detached
    children, and the list of leaf counts found from each detached child.
    """
    n_leaves_detached = []
    newdata = []
    for ch, dist in tree.data.get(node, []):
        if dist < maxdist:
            newdata.append((ch, dist))
        else:
            # Count the leaves found from the detached child.
            # (Fix: this used the undefined name `child` instead of `ch`.)
            n_leaves_detached.append(
                sum(1 for _ in iter_leaves(tree, get_children, [ch])))
    tree.data[node] = newdata
    # One leaf count is recorded per detached child.
    return len(n_leaves_detached), n_leaves_detached
def prottree_extract_genecounts(proteintrees, ancestor, phyltree,
                                speciesset=frozenset(('Homo sapiens',)),
                                keeponly=None):
    """Walk each tree to find the ancestor taxon, then output the gene counts
    per descendant species, as well as the list of genes from `speciesset`.

    Param:
        `proteintrees`: iterable of trees exposing `root`, `info` and `data`.
        `ancestor`: name of the ancestral taxon to locate.
        `phyltree`: species tree; the attributes read here are `root`,
                    `dicLinks`, `allDescendants`, `outgroupSpecies`,
                    `getTargetsAnc`, `officialName`, `listSpecies`, `allNames`.
        `speciesset`: species whose gene names are collected. The default is
                      immutable so that it cannot be altered between calls.
        `keeponly` should be None, "stem" or "crown".

    Return: four lists of equal length, one item per selected node:
        `ancestors`: (taxon_parent, taxon_node) pairs,
        `ancestor_ancgenes`: the 'family_name' of the node,
        `ancestor_genecounts`: defaultdict species -> number of leaves,
        `ancestor_spgenes`: dict species -> tuple of gene names.

    Raise: RuntimeError if a leaf taxon is neither in `phyltree.listSpecies`
    nor in `phyltree.allNames`.

    WARNING: trees are edited in place (entries are popped from `tree.data`).
    """
    # for myProteinTree class
    def get_children(tree, node):
        return [c for c, _ in tree.data.get(node, [])]

    ancestor_ancgenes = []
    ancestor_genecounts = []  # In each species
    ancestor_spgenes = []  # For genes from speciesset.
    ancestors = []  # Ancestors at which we got the node.

    clades_before_ancestor = set(
        phyltree.dicLinks[phyltree.root][ancestor])  # includes anc
    clades_after_ancestor = phyltree.allDescendants[ancestor]  # includes anc
    # Not used below; kept so that the same phyltree lookups are performed.
    clades_outside_ancestor = (
        phyltree.outgroupSpecies[ancestor]
        | (phyltree.getTargetsAnc("/" + ancestor) - clades_before_ancestor))

    for tree in proteintrees:
        info = tree.info
        if info[tree.root]['taxon_name'] not in clades_before_ancestor:
            # The root is already past the ancestor: nothing to find here.
            continue

        for parent, node in dfw_pairs_generalized(tree, get_children,
                                                  include_root=True):
            taxon_node = info[node]['taxon_name']
            if parent is not None:
                taxon_parent = info[parent]['taxon_name']
                isdup = (info[parent]['Duplication'] > 1)
            else:
                # then the child node (i.e the root) should be kept if it
                # is exactly equal to ancestor.
                taxon_parent = taxon_node  # We know it's before (or equal).
                isdup = False

            if taxon_parent in clades_before_ancestor and \
                    taxon_node in clades_after_ancestor:
                if taxon_node != ancestor and taxon_parent != ancestor:
                    # The branch "jumps" over this ancestral species:
                    # the node itself (not its parent) is processed.
                    pass
                elif taxon_node != ancestor and not isdup:
                    # The parent is an 'ancestor' speciation so this node
                    # should already have been taken into account.
                    # Except if the parent is the root.
                    continue

                if keeponly == 'crown' and info[node]['Duplication'] > 1:
                    # node is at the ancestor.
                    # We don't want to keep it if it is a duplication.
                    continue

                nodename = info[node]['family_name']
                ancestor_ancgenes.append(nodename)
                ancestors.append((taxon_parent, taxon_node))

                spgenes = {sp: [] for sp in speciesset}
                gene_counts = defaultdict(int)
                for leaf in iter_leaves(tree, get_children, queue=[node]):
                    taxon_leaf = phyltree.officialName[
                        info[leaf]['taxon_name']]
                    if taxon_leaf not in phyltree.listSpecies:
                        # Error because of tree.data.pop
                        errmsg = "%d '%s' is not a species! (tree %d, node %d)"\
                                 % (leaf, taxon_leaf, tree.root, node)
                        if taxon_leaf in phyltree.allNames:
                            if keeponly != 'stem':
                                logger.error(errmsg)
                            # NOTE(review): `continue` placed at this level so
                            # that non-species leaves are never counted --
                            # confirm against the original indentation.
                            continue
                        else:
                            raise RuntimeError(errmsg)

                    gene_counts[taxon_leaf] += 1
                    if taxon_leaf in speciesset:
                        spgenes[taxon_leaf].append(info[leaf]['gene_name'])

                ancestor_genecounts.append(gene_counts)
                # tuple: Important for finding `()` in Series.
                ancestor_spgenes.append(
                    {sp: tuple(genes) for sp, genes in spgenes.items()})

                # Now do we want to score descendant ancestor nodes?
                if keeponly == 'stem':
                    try:
                        tree.data.pop(node)
                    except KeyError:
                        # This node is not in data because it is a leaf:
                        assert node in tree.info

            # NOTE(review): this `elif` is attached to the outer test
            # (parent before / node after the ancestor), as suggested by the
            # comment below -- confirm against the original indentation.
            elif taxon_node not in clades_before_ancestor:
                # Avoid visiting outgroups and strict ingroups.
                try:
                    # I don't know how, but this messes up the dfw iteration.
                    tree.data.pop(node)
                    logger.debug("pop data of %d (%s)", node, taxon_node)
                except KeyError:
                    # It's a leaf.
                    logger.debug("Ignore data of %d (%s)", node, taxon_node)

    assert len(ancestor_ancgenes) == len(ancestors)
    assert len(ancestor_ancgenes) == len(ancestor_spgenes)
    assert len(ancestor_ancgenes) == len(ancestor_genecounts)
    return ancestors, ancestor_ancgenes, ancestor_genecounts, ancestor_spgenes
def tree_detach_toolong(tree, maxdist=MAXDIST):
    """Detach from `tree` the children attached by a branch >= `maxdist`.

    The nodes are visited with `rev_dfw_descendants` (children are expected
    to be processed before their parent, since their counts are popped from
    `counts_by_child`). At each internal node, a child is removed from the
    node data if its branch length is >= `maxdist`, or if all of its own
    children were detached (it is then in `new_leaves`).

    The tree is edited in place: `tree.data` is rewritten for every internal
    node, the info of nodes put in a detached subtree is popped from
    `tree.info`, and `tree.root` is set to None when all children of the root
    were detached.

    Return: `(root_counts, detached_subtrees)` where `root_counts` is the
    list `[detached, included, detached_leaves, leaves,
    detached_leafset_sizes]` accumulated at the root, and `detached_subtrees`
    is a list of `myProteinTree.ProteinTree`.
    """
    # Leaf count before any edit, used by the consistency checks at the end.
    initial_Nleaves = len(list(iter_leaves(tree, get_children)))
    # Contains `node_counts` for the last visited children.
    # Recursively updated while moving from the leaves to the root.
    counts_by_child = {}
    #detached_leaves = set()
    leafset = set()              # Nodes visited without children.
    detached_ids = set()         # Children removed from their parent.
    new_leaves = set()           # Internal nodes left without any child.
    included_new_leaves = set()  # Former `new_leaves` removed by their parent
                                 # (only written, never read here).
    detached_subtrees = []
    #logger.debug("%d/%d\n%d/%d\n%d - %d = %d\n%d - %d = %d\nlen_iter_leaves = %d",
    #             len(tree.info), len(set(tree.info)),
    #             len(tree.data), len(set(tree.data)),
    #             len(tree.info), len(tree.data), len(tree.info) - len(tree.data),
    #             len(set(tree.info)),len(set(tree.data)),len(set(tree.info)-set(tree.data)),
    #             len(list(iter_leaves(tree, get_children))))
    #logger.debug([n for n,info in tree.info.items() if (n not in tree.data and not info.get('gene_name'))])
    for node_dist, children_dists in rev_dfw_descendants(tree, get_data,
                                                         include_leaves=True,
                                                         queue=[(tree.root, 0)]):
        # (detached, included, detached_leaves, leaves) #, detached_leafset_sizes
        #   [0] detached: detached children, not nested in a detached subtree
        #   [1] included: detachments nested inside a detached subtree
        #   [2] detached_leaves: leaves below the detached children
        #   [3] leaves: all leaves below this node
        #   [4] detached_leafset_sizes: one leaf count per detached child
        # TODO: namedtuple
        node_counts = [0, 0, 0, 0, []]
        if not children_dists:
            # Add 1 to the leaf count.
            node_counts[3] += 1
            leafset.add(node_dist[0])
        else:
            newdata = []  # The children that stay attached to this node.
            for child, dist in children_dists:
                child_counts = counts_by_child.pop(child)
                # Add the leaf count.
                node_counts[3] += child_counts[3]
                # Conditionally on this child being detached, update the counts.
                if dist >= maxdist or child in new_leaves:
                    node_counts[0] += 1  # directly detached descendants
                    node_counts[1] += sum(child_counts[:2])  # detached included
                    node_counts[2] += child_counts[3]  # child_leaves -> detached_leaves
                    node_counts[4].append(child_counts[3])  # detached_leafset_size
                    detached_ids.add(child)
                    if child in new_leaves:
                        new_leaves.remove(child)
                        included_new_leaves.add(child)
                        # NOTE(review): nesting of this guard under
                        # `if child in new_leaves` is a reconstruction --
                        # confirm against the original indentation.
                        if dist >= maxdist:
                            continue  # Do not yield this subtree, go to next child.
                    detached_data = {}
                    detached_info = {}
                    # This edits the tree inplace, but the outer iteration
                    # does not change because it is already precomputed
                    # (rev_dfw_ uses a list).
                    for (n, d), ndata in dfw_descendants_generalized(
                            tree, get_data, include_leaves=True,
                            queue=[(child, 0)]):
                        # Move the node info from the tree to the subtree.
                        detached_info[n] = tree.info.pop(n)  #.pop
                        if ndata:
                            detached_data[n] = ndata
                    detached_subtrees.append(
                        myProteinTree.ProteinTree(detached_data, detached_info,
                                                  child))
                    ### TODO: add a 'tree_name' at the subtree.info[subtree.root]
                    ### TODO: add a new suffix to the family_name
                    ###       (so as not to crash prune2family, which could
                    ###        output several subtrees with the same name).
                else:
                    # The child stays: propagate its counts to this node.
                    node_counts[0] += child_counts[0]
                    node_counts[1] += child_counts[1]
                    node_counts[2] += child_counts[2]
                    node_counts[4].extend(child_counts[4])
                    newdata.append((child, dist))
            #if len(tree.data[node_dist[0]]) != len(newdata):
            #if any((x!=y) for x,y in zip(sorted(tree.data[node_dist[0]]),
            #                             sorted(newdata))):
                #import ipdb; ipdb.set_trace()
            if not newdata:
                logger.warning(
                    "All children detached at node %d from tree %d.",
                    node_dist[0], tree.root)
                # Flag the node so that its own parent detaches it as well.
                new_leaves.add(node_dist[0])
                ### DO NOT output this tree if the root supports no extant species.
            tree.data[node_dist[0]] = newdata
        # Stored for the parent, which pops it when it is visited.
        counts_by_child[node_dist[0]] = node_counts
    # Every entry except the root's has been popped by its parent.
    assert len(counts_by_child) == 1
    root_counts = counts_by_child[tree.root]
    #DEBUG
    #import ipdb; ipdb.set_trace()
    #logger.debug("%d/%d\n%d/%d\n%d - %d = %d\n%d - %d = %d\nlen_iter_leaves = %d",
    #             len(tree.info), len(set(tree.info)),
    #             len(tree.data), len(set(tree.data)),
    #             len(tree.info), len(tree.data), len(tree.info) - len(tree.data),
    #             len(set(tree.info)),len(set(tree.data)),len(set(tree.info)-set(tree.data)),
    #             len(list(iter_leaves(tree, get_children))))
    logger.debug("Tree %d: %s,\ndeleted ids: %s", tree.root, root_counts,
                 ' '.join(str(i) for i in detached_ids))
    # Leaf count of the edited tree.
    root_Nleaves = len(list(iter_leaves(tree, get_children, [tree.root])))
    # Consistency checks between the visited leaves, the leaves still
    # reachable from the root and the accumulated counts.
    assert len(leafset) == initial_Nleaves, \
        "leafset %d != initial_Nleaves %d" %(len(leafset), initial_Nleaves)
    assert len(leafset) == root_Nleaves - len(new_leaves) + root_counts[2], \
        "Tree %d: leafset %d != %d root_Nleaves - %d new_leaves + %d detached_leaves" %\
        (tree.root, len(leafset), root_Nleaves, len(new_leaves), root_counts[2])
    assert initial_Nleaves == root_counts[3], "%d != %d, %s" % (
        initial_Nleaves, root_counts[3], root_counts)
    assert len(root_counts[4]) == root_counts[0]
    if tree.root in new_leaves:
        # We should not output this tree: mark it:
        logger.warning("The root %d does not support any tree.", tree.root)
        tree.root = None
    return root_counts, detached_subtrees
def edit_from_selection(proteintrees, badnodes):
    """Keep only one child, the one leading to the closest leaves.

    Generator over `proteintrees`. For each tree, the nodes of `badnodes`
    found in `tree.data` are edited with `keep_closest_leaf`, starting from
    the ones with the largest leaf set. A bad node whose leaves intersect a
    strictly larger leaf set that was already edited is skipped (counted as
    implicitly edited).

    Param:
        `proteintrees`: iterable of trees exposing `root` and `data`.
        `badnodes`: set of node ids to edit. It is modified in place: the
                    nodes found in a tree are removed from it, so that the
                    nodes never found can be reported at the end.

    Yield: `(edited, tree)` with `edited` False when the tree holds no bad
    node, True otherwise.

    Raise: NotImplementedError if the root of a tree is listed in `badnodes`.
    """
    n_edits = 0
    n_included = 0  # Number of nodes not edited, because already in a larger
                    # edited subtree
    n_morethan2 = 0  # Number of nodes with >2 leaves
    for tree in proteintrees:
        tree_badnodes = badnodes.intersection(tree.data)
        if not tree_badnodes:
            logger.debug("No nodes to remove.")
            yield False, tree
            continue
        if tree.root in tree_badnodes:
            logger.error("Root node %d listed as a node to remove!!!",
                         tree.root)
            raise NotImplementedError("Root node %d marked for removal: "
                                      "don't know what to do." % tree.root)

        # First the bad nodes must be sorted according to their rootwardness.
        # (in order to edit only the most basal in case of inclusion, and
        # ignore the included ones)
        badnode_leaves = sorted(
            ((badnode, set(iter_leaves(tree, get_children, [badnode])))
             for badnode in tree_badnodes),
            key=lambda x: len(x[1]),
            reverse=True)

        edited_leafsets = []
        # Number of edited leafsets strictly larger than the current size.
        larger_size_i = 0
        # Current size. It must start at the size of the largest leafset
        # (Fix: it was read from the first item of the *unsorted* list, in
        # which case nested nodes were not checked against larger ones).
        size = len(badnode_leaves[0][1])
        for badnode, leaves in badnode_leaves:
            # Check leaf number: then only check intersection with strictly
            # larger sets.
            new_size = len(leaves)
            if new_size < size:
                larger_size_i = len(edited_leafsets)
                size = new_size
            # Edit only if no ancestral node has already been edited.
            if not any((leaves & edited_l)
                       for edited_l in edited_leafsets[:larger_size_i]):
                edited_leafsets.append(leaves)
                keep_closest_leaf(tree, badnode, indicator=True)
                n_edits += 1
            else:
                n_included += 1

        # The smallest bad node is expected to hold exactly 2 leaves.
        assert size == 2
        n_morethan2 += larger_size_i
        badnodes.difference_update(tree_badnodes)
        logger.debug(
            "Tree: %d: %d edited nodes; %d implicitely edited; %d with >2 leaves",
            tree.root, n_edits, n_included, n_morethan2)
        yield True, tree

    if badnodes:
        logger.warning("%d nodes not found: %s", len(badnodes),
                       ' '.join(str(n) for n in badnodes))
    logger.info(
        "\n  %9d edited nodes\n"
        " +%8d implicitely edited\n"
        "  %9d with >2 leaves", n_edits, n_included, n_morethan2)
def fuse_subspecies(forest, species2seq, delete_distant_orthologs=False):
    """Merge the sequences of redundant (sub)species into their parent node.

    Generator over `forest`. In each tree, every node that is not a
    duplication (`Duplication == 0`) and whose taxon is a key of
    `species2seq` is replaced by one of its children: the info of the kept
    child is merged into the parent, its data (if any) is moved to the
    parent, and the other children are removed from `tree.info`/`tree.data`.

    For the other non-duplication nodes, children whose 'gene_name' starts
    with one of the listed sequence prefixes are only reported and counted
    (as "single" or "separated").

    Param:
        `forest`: iterable of trees exposing `root`, `info` and `data`.
        `species2seq`: dict taxon name -> list of gene name prefixes.
        `delete_distant_orthologs`: currently unused.

    Yield: each tree, edited in place.

    Raise: ValueError if, at a fused node with several children, a sequence
    prefix does not match any child.
    """
    n_fused = 0
    n_2ch = 0
    n_1ch = 0
    n_single = 0  # Sequence from the given redundant set, without ortholog in the tree.
    n_separated = 0  # Do not share a MRCA in the given species with its apparent orthologs.
    for tree in forest:
        info = tree.info
        data = tree.data
        kept_children = set()  # Check variable.
        removed_children = set()
        edited_parents = set()
        for (parent, _), childrendists in dfw_descendants_generalized(
                tree, get_data, queue=[(tree.root, 0)]):
            if info[parent]['Duplication'] != 0:
                continue
            parent_taxon = info[parent]['taxon_name']
            if parent_taxon in species2seq:
                assert 'gene_name' not in info[parent]
                data.pop(parent)
                assert len(childrendists) <= len(species2seq[parent_taxon]), \
                    "Too many descendant sequences at node %d (%s)" \
                    % (parent, parent_taxon)
                if len(childrendists) > 1:
                    # Test each sequence start, and quit at first match.
                    # NOTE(review): the `else` is attached to the inner loop
                    # (each prefix must match a child); the kept child is then
                    # the one matched by the last prefix -- confirm against
                    # the original indentation.
                    for seqstart in species2seq[parent_taxon]:
                        for ch_i, (ch, chdist) in enumerate(childrendists):
                            gene_names, _gene_dists = zip(*(
                                (info[tip]['gene_name'], gdist)
                                for tip, gdist in iter_distleaves(
                                    tree, get_data, root=ch)))
                            if len(gene_names) > 1:
                                assert info[ch]['Duplication'] != 0
                            if gene_names[0].startswith(seqstart):
                                # No inner speciation should be possible.
                                assert all(gn.startswith(seqstart)
                                           for gn in gene_names)
                                break
                        else:
                            raise ValueError("%s not matched by any of %s" % \
                                    (gene_names, species2seq[parent_taxon]))
                    kept_ch, kept_dist = ch, chdist
                    # All the siblings of the kept child.
                    # (Fix: the upper slice used `i`, which is not the index
                    # of the kept child: unbound or stale in this branch.)
                    removed_ch = [c for c, _ in (childrendists[:ch_i]
                                                 + childrendists[(ch_i+1):])]
                    for rc in removed_ch:
                        info.pop(rc)
                        rdat = data.pop(rc, None)
                        if rdat:
                            logger.warning('Removed child %d had descendants: %s',
                                           rc, rdat)
                    removed_children.update(removed_ch)
                    n_2ch += 1
                else:
                    kept_ch, kept_dist = childrendists[0]
                    n_1ch += 1
                if kept_dist > 0:
                    logger.warning("%d %r sequence distance > 0!", kept_ch,
                                   info[kept_ch].get('gene_name'))
                # Replace with the correct child.
                # So that 'tree_name' or 'Bootstrap' fields are conserved.
                info[parent].update(info.pop(kept_ch))
                try:
                    data[parent] = data.pop(kept_ch)
                except KeyError:
                    # The kept child has no data entry: the parent
                    # (popped from `data` above) gets none either.
                    pass
                kept_children.add(kept_ch)
                edited_parents.add(parent)
                n_fused += 1
            else:
                for i, (ch, _) in enumerate(childrendists):
                    for spseqs in species2seq.values():
                        for j, seq in enumerate(spseqs):
                            if info[ch].get('gene_name', '').startswith(seq):
                                logger.warning("Unexpected parent: %d (%s) at leaf %d %s",
                                               parent, parent_taxon, ch,
                                               info[ch]['gene_name'])
                                break
                        else:
                            # No prefix of this species matches: next species.
                            continue
                        # If match, Need to check that there is no sister
                        # sequence in the other child
                        sister_childrendists = (childrendists[:i]
                                                + childrendists[(i+1):])
                        sister_seqs = spseqs[:j] + spseqs[(j+1):]
                        has_orthologs = 0
                        for sister_ch, _ in sister_childrendists:
                            sister_matched = [
                                info[tip]['gene_name']
                                for tip in iter_leaves(tree, get_children,
                                                       [sister_ch])
                                for sseq in sister_seqs
                                if info[tip]['gene_name'].startswith(sseq)]
                            if sister_matched:
                                has_orthologs += len(sister_matched)
                                logger.warning('Potential subspecies orthologs found: '
                                               '%s', sister_matched)
                        # But exclude those if we find paralogs
                        paralogs = [info[tip]['gene_name']
                                    for tip in iter_leaves(tree, get_children,
                                                           [ch])
                                    if info[tip]['gene_name'].startswith(seq)]
                        if has_orthologs and not paralogs:
                            n_separated += 1
                            # Comment: there was none of those in Ensembl 93.
                        else:
                            n_single += 1
        # Sanity checks: removed/kept children are gone, edited parents remain.
        assert not removed_children.intersection(info)
        assert not removed_children.intersection(data)
        assert not kept_children.intersection(info)
        assert not kept_children.intersection(data)
        assert len(edited_parents) == len(edited_parents.intersection(info))
        yield tree
    logger.info("\n%d fusions (%d from >2 sequences, %d from 1 sequence).\n"
                "%d singles (only one of the two subspecies was found)\n"
                "%d separated (1 or more distant orthologs in the sister "
                "subspecies were found)",
                n_fused, n_2ch, n_1ch, n_single, n_separated)