def build_subsets_tree(self, curr_tmp_dir_par):
    """Collapse the guide tree into a tree with one node per alignment subset.

    Each leaf of ``self.tree`` is tagged with the alignment job registered
    for it in ``self.pasta_team.subsets``.  Internal nodes are then labelled
    from their children, redundant edges are contracted, and remaining
    ambiguous edges are contracted in order of increasing length until every
    surviving node carries exactly one job.

    Args:
        curr_tmp_dir_par: path prefix.  The part of each job's
            ``tmp_dir_par`` that follows this prefix plus one separator
            character is used as the subset label.

    Returns:
        The ``PhylogeneticTree`` of subsets; every node's ``label`` is a
        subset label.

    Side effects:
        ``self.pasta_team.subsets`` is replaced by a dict mapping subset
        label -> alignment job.
    """
    prefix_len = len(curr_tmp_dir_par) + 1

    # Map original taxon labels to subset names, and subset names to a
    # singleton set holding the job (sets make the labelling below easy).
    translate = {}
    jobs_by_name = {}
    for leaf in self.tree._tree.leaf_iter():
        nalsj = self.pasta_team.subsets[leaf.taxon.label]
        newname = nalsj.tmp_dir_par[prefix_len:]
        translate[leaf.taxon.label] = newname
        jobs_by_name[newname] = set([nalsj])

    subsets_tree = PhylogeneticTree(
        read_newick_with_translate(StringIO(self.tree_str),
                                   translate_dict=translate))
    for leaf in subsets_tree._tree.leaf_iter():
        leaf.alignment_subset_job = jobs_by_name[leaf.taxon]
    del jobs_by_name
    del translate
    _LOG.debug("nodes labeled")

    # Make sure the tree is rooted at a branch (not at a node).
    root_children = subsets_tree._tree.seed_node.child_nodes()
    if len(root_children) > 2:
        subsets_tree._tree.reroot_at_edge(root_children[0].edge)
    _LOG.debug("Subset Labeling (start):\n%s"
               % str(subsets_tree.compose_newick()))

    # Label internal nodes from their children and collapse redundant edges.
    for node in subsets_tree._tree.postorder_internal_node_iter():
        # A node's label is the intersection of its children's labels,
        # unless that is empty, in which case it is their union.
        if getattr(node, "alignment_subset_job", None) is None:
            child_labels = [c.alignment_subset_job
                            for c in node.child_nodes()]
            node.alignment_subset_job = set.intersection(*child_labels)
            if not node.alignment_subset_job:
                node.alignment_subset_job = set.union(*child_labels)

        # Prune any child whose label encompasses this node's label.
        # Index-based on purpose: a collapse can attach new children to
        # this node, and those must be examined as well.
        i = 0
        while i < len(node.child_nodes()):
            child = node.child_nodes()[i]
            if node.alignment_subset_job.issubset(child.alignment_subset_job):
                # DendroPy cannot collapse an edge that leads to a tip;
                # remove the tip instead.
                if child.child_nodes():
                    child.edge.collapse()
                else:
                    node.remove_child(child)
            else:
                i += 1

    # The remaining edges may still join nodes with overlapping labels.
    # Resolve them shortest edge first.  Collect the candidates ...
    candidate_edges = []
    for e in subsets_tree._tree.postorder_edge_iter():
        if e.tail_node and e.head_node.alignment_subset_job.intersection(
                e.tail_node.alignment_subset_job):
            candidate_edges.append((e.length, e))
    # ... then sort on the length ONLY.  Sorting the bare tuples would fall
    # back to comparing the edge objects (or None with a float) whenever
    # lengths tie or are missing, which raises TypeError on Python 3.
    # Missing/zero lengths sort first; ties keep post-order.
    candidate_edges.sort(key=lambda item: item[0] if item[0] else -1)

    for _, edge in candidate_edges:
        # Re-check: earlier contractions may have changed the labels.
        shared = edge.tail_node.alignment_subset_job.intersection(
            edge.head_node.alignment_subset_job)
        if shared:
            edge.tail_node.alignment_subset_job = shared
            if edge.head_node.child_nodes():
                edge.collapse()
            else:
                edge.tail_node.remove_child(edge.head_node)

    # Detach the job objects from the nodes (they can cause deep-copy
    # problems), give every node a label, and keep a label -> job mapping.
    self.pasta_team.subsets = {}
    for node in subsets_tree._tree.postorder_node_iter():
        assert len(node.alignment_subset_job) == 1
        nalsj = node.alignment_subset_job.pop()
        node.alignment_subset_job = None
        node.label = nalsj.tmp_dir_par[prefix_len:]
        self.pasta_team.subsets[node.label] = nalsj
        if node.is_leaf():
            # Add a dummy taxon, or else dendropy can get confused.
            node.taxon = Taxon(label=node.label)
    return subsets_tree
def build_subsets_tree(self, curr_tmp_dir_par, build_min_tree=True):
    """Build the tree whose nodes are the alignment subsets.

    Args:
        curr_tmp_dir_par: path prefix.  The part of each job's
            ``tmp_dir_par`` that follows this prefix plus one separator
            character is used as the subset (group) label.
        build_min_tree: when true (default) the tree is produced by
            ``build_groups_MST``; otherwise the older heuristic is used:
            label internal nodes from their children, then contract
            edges, shortest first, until each node holds one job.

    Returns:
        A ``PhylogeneticTree`` whose node labels are subset labels.

    Raises:
        Exception: (heuristic path only) if two nodes of the resulting
            tree end up with the same label.

    Side effects:
        ``self.pasta_team.subsets`` is replaced by a dict mapping subset
        label -> alignment job.
    """
    prefix_len = len(curr_tmp_dir_par) + 1

    # uym2 added: add option for MST
    if build_min_tree:
        _LOG.debug("START building Minimum Spanning Tree")
        grouping = {}
        groupName2jobName = {}
        for leaf in self.tree._tree.leaf_node_iter():
            job = self.pasta_team.subsets[leaf.taxon.label]
            groupName = job.tmp_dir_par[prefix_len:]
            # build_groups_MST is handed group names without "/".
            grouping[leaf.taxon.label] = groupName.replace("/", "")
            groupName2jobName[groupName] = job

        subsets_tree = build_groups_MST(self.tree._tree, grouping)

        # Put the separators back so labels match the keys of
        # groupName2jobName.  NOTE(review): this presumes every path
        # component after the first starts with "d" and that no other
        # "d" occurs in a group name -- confirm against the job naming.
        for node in subsets_tree.postorder_node_iter():
            if node.is_leaf():
                node.taxon.label = node.taxon.label.replace("d", "/d")
            node.label = node.label.replace("d", "/d")
        self.pasta_team.subsets = groupName2jobName
        MST = PhylogeneticTree(subsets_tree)
        _LOG.debug("Spanning tree is:\n %s" % MST)
        return MST

    ###################################
    _LOG.debug("START building heuristic spanning tree")
    # Map original taxon labels to subset names, and subset names to a
    # singleton set holding the job.
    translate = {}
    jobs_by_name = {}
    for leaf in self.tree._tree.leaf_node_iter():
        nalsj = self.pasta_team.subsets[leaf.taxon.label]
        newname = nalsj.tmp_dir_par[prefix_len:]
        translate[leaf.taxon.label] = newname
        jobs_by_name[newname] = set([nalsj])

    subsets_tree = PhylogeneticTree(
        Tree.get(data=self.tree_str, schema='newick'))
    for leaf in subsets_tree._tree.leaf_node_iter():
        leaf.alignment_subset_job = jobs_by_name[translate[leaf.taxon.label]]
    del jobs_by_name
    del translate
    _LOG.debug("leafs labeled")

    # Make sure the tree is rooted at a branch (not at a node).
    root_children = subsets_tree._tree.seed_node.child_nodes()
    if len(root_children) > 2:
        # Prefer the first child reached through an internal edge; if
        # there is none, fall back to the last child (as before).
        c = next((ch for ch in root_children if ch.edge.is_internal()),
                 root_children[-1])
        subsets_tree._tree.is_rooted = True
        # An edge may carry no length (the sort key further down already
        # allows for that); halving None would raise TypeError, so pass
        # None through and let the new edges stay without a length.
        edge_length = c.edge.length
        half = edge_length / 2. if edge_length is not None else None
        subsets_tree._tree.reroot_at_edge(c.edge,
                                          length1=half,
                                          length2=half,
                                          suppress_unifurcations=False)
    _LOG.debug(
        "Subset Labeling (start):\n%s" %
        str(subsets_tree.compose_newick(suppress_rooting=False))[0:5000])

    # Label internal nodes from their children and collapse redundant edges.
    for node in subsets_tree._tree.postorder_internal_node_iter():
        # A node's label is the intersection of its children's labels,
        # unless that is empty, in which case it is their union.
        if getattr(node, "alignment_subset_job", None) is None:
            child_labels = [ch.alignment_subset_job
                            for ch in node.child_nodes()]
            node.alignment_subset_job = set.intersection(*child_labels)
            if not node.alignment_subset_job:
                node.alignment_subset_job = set.union(*child_labels)

        # Prune any child whose label encompasses this node's label.
        # Index-based on purpose: a collapse can attach new children to
        # this node, and those must be examined as well.
        i = 0
        while i < len(node.child_nodes()):
            child = node.child_nodes()[i]
            if node.alignment_subset_job.issubset(child.alignment_subset_job):
                # DendroPy cannot collapse an edge that leads to a tip;
                # remove the tip instead.
                if child.child_nodes():
                    child.edge.collapse()
                else:
                    node.remove_child(child)
            else:
                i += 1

        # Provisional label (all candidate subsets joined by "+").
        node.label = "+".join(nj.tmp_dir_par[prefix_len:]
                              for nj in node.alignment_subset_job)
        if node.is_leaf():
            # Pruning can turn an internal node into a tip; give it a taxon.
            node.taxon = subsets_tree._tree.taxon_namespace.new_taxon(
                label=node.label)

    _LOG.debug(
        "Before final round, the tree is:\n %s" %
        str(subsets_tree.compose_newick(suppress_rooting=False))[0:5000])

    # The remaining edges may still join nodes with overlapping labels.
    # Resolve them shortest edge first.  A list (not a set) keeps ties in
    # post-order, so the result no longer depends on hash order.
    candidate_edges = []
    for e in subsets_tree._tree.postorder_edge_iter():
        if e.tail_node and e.head_node.alignment_subset_job.intersection(
                e.tail_node.alignment_subset_job):
            candidate_edges.append((e.length, e))
    # Sort on length only; missing or zero lengths go first.
    candidate_edges.sort(key=lambda item: item[0] if item[0] else -1)

    for _, edge in candidate_edges:
        # Re-check: earlier contractions may have changed the labels.
        shared = edge.tail_node.alignment_subset_job.intersection(
            edge.head_node.alignment_subset_job)
        if shared:
            edge.tail_node.alignment_subset_job = shared
            if edge.head_node.child_nodes():
                edge.collapse()
            else:
                edge.tail_node.remove_child(edge.head_node)

    # Detach the job objects from the nodes (they can cause deep-copy
    # problems), give every node its final label, and keep a
    # label -> job mapping.
    self.pasta_team.subsets = {}
    for node in subsets_tree._tree.postorder_node_iter():
        assert len(node.alignment_subset_job) == 1
        nalsj = node.alignment_subset_job.pop()
        node.alignment_subset_job = None
        node.label = nalsj.tmp_dir_par[prefix_len:]  # last part of the name
        self.pasta_team.subsets[node.label] = nalsj
        if node.is_leaf():
            # Add a dummy taxon, or else dendropy can get confused.
            node.taxon = subsets_tree._tree.taxon_namespace.new_taxon(
                label=node.label)

    _LOG.debug("Spanning tree is:\n %s" % subsets_tree)
    labels = [nd.label for nd in subsets_tree._tree.postorder_node_iter()]
    if len(set(labels)) != len(labels):
        import collections
        raise Exception("Duplicate names found %s" % "\n".join(
            item for item, count in collections.Counter(labels).items()
            if count > 1))
    return subsets_tree
def build_subsets_tree(self, curr_tmp_dir_par, build_min_tree=True):
    """Build the tree whose nodes are the alignment subsets.

    Args:
        curr_tmp_dir_par: path prefix.  The part of each job's
            ``tmp_dir_par`` that follows this prefix plus one separator
            character is used as the subset (group) label.
        build_min_tree: when true (default) the tree is produced by
            ``build_groups_MST``; otherwise the older heuristic is used:
            label internal nodes from their children, then contract
            edges, shortest first, until each node holds one job.

    Returns:
        A ``PhylogeneticTree`` whose node labels are subset labels.

    Raises:
        Exception: (heuristic path only) if two nodes of the resulting
            tree end up with the same label.

    Side effects:
        ``self.pasta_team.subsets`` is replaced by a dict mapping subset
        label -> alignment job.
    """
    name_start = len(curr_tmp_dir_par) + 1

    # uym2 added: add option for MST
    if build_min_tree:
        _LOG.debug("START building Minimum Spanning Tree")
        grouping = {}
        groupName2jobName = {}
        for leaf in self.tree._tree.leaf_node_iter():
            job = self.pasta_team.subsets[leaf.taxon.label]
            groupName = job.tmp_dir_par[name_start:]
            # build_groups_MST is handed group names without "/".
            grouping[leaf.taxon.label] = groupName.replace("/", "")
            groupName2jobName[groupName] = job

        subsets_tree = build_groups_MST(self.tree._tree, grouping)

        # Put the separators back so labels match the keys of
        # groupName2jobName.  NOTE(review): this presumes every path
        # component after the first starts with "d" and that no other
        # "d" occurs in a group name -- confirm against the job naming.
        for node in subsets_tree.postorder_node_iter():
            if node.is_leaf():
                node.taxon.label = node.taxon.label.replace("d", "/d")
            node.label = node.label.replace("d", "/d")
        self.pasta_team.subsets = groupName2jobName
        MST = PhylogeneticTree(subsets_tree)
        _LOG.debug("Spanning tree is:\n %s" % MST)
        return MST

    ###################################
    _LOG.debug("START building heuristic spanning tree")
    # Map original taxon labels to subset names, and subset names to a
    # singleton set holding the job.
    translate = {}
    name_to_jobs = {}
    for leaf in self.tree._tree.leaf_node_iter():
        nalsj = self.pasta_team.subsets[leaf.taxon.label]
        newname = nalsj.tmp_dir_par[name_start:]
        translate[leaf.taxon.label] = newname
        name_to_jobs[newname] = set([nalsj])

    subsets_tree = PhylogeneticTree(
        Tree.get(data=self.tree_str, schema='newick'))
    for leaf in subsets_tree._tree.leaf_node_iter():
        leaf.alignment_subset_job = name_to_jobs[translate[leaf.taxon.label]]
    del name_to_jobs
    del translate
    _LOG.debug("leafs labeled")

    # Make sure the tree is rooted at a branch (not at a node).
    top_children = subsets_tree._tree.seed_node.child_nodes()
    if len(top_children) > 2:
        # Prefer the first child reached through an internal edge; if
        # there is none, fall back to the last child (as before).
        c = next((ch for ch in top_children if ch.edge.is_internal()),
                 top_children[-1])
        subsets_tree._tree.is_rooted = True
        # An edge may carry no length (the sort key further down already
        # allows for that); halving None would raise TypeError, so pass
        # None through and let the new edges stay without a length.
        if c.edge.length is None:
            half = None
        else:
            half = c.edge.length / 2.
        subsets_tree._tree.reroot_at_edge(c.edge,
                                          length1=half,
                                          length2=half,
                                          suppress_unifurcations=False)
    _LOG.debug(
        "Subset Labeling (start):\n%s" %
        str(subsets_tree.compose_newick(suppress_rooting=False))[0:5000])

    # Label internal nodes from their children and collapse redundant edges.
    for node in subsets_tree._tree.postorder_internal_node_iter():
        # A node's label is the intersection of its children's labels,
        # unless that is empty, in which case it is their union.
        if getattr(node, "alignment_subset_job", None) is None:
            child_labels = [ch.alignment_subset_job
                            for ch in node.child_nodes()]
            node.alignment_subset_job = set.intersection(*child_labels)
            if not node.alignment_subset_job:
                node.alignment_subset_job = set.union(*child_labels)

        # Prune any child whose label encompasses this node's label.
        # Index-based on purpose: a collapse can attach new children to
        # this node, and those must be examined as well.
        i = 0
        while i < len(node.child_nodes()):
            child = node.child_nodes()[i]
            if node.alignment_subset_job.issubset(child.alignment_subset_job):
                # DendroPy cannot collapse an edge that leads to a tip;
                # remove the tip instead.
                if child.child_nodes():
                    child.edge.collapse()
                else:
                    node.remove_child(child)
            else:
                i += 1

        # Provisional label (all candidate subsets joined by "+").
        node.label = "+".join(nj.tmp_dir_par[name_start:]
                              for nj in node.alignment_subset_job)
        if node.is_leaf():
            # Pruning can turn an internal node into a tip; give it a taxon.
            node.taxon = subsets_tree._tree.taxon_namespace.new_taxon(
                label=node.label)

    _LOG.debug(
        "Before final round, the tree is:\n %s" %
        str(subsets_tree.compose_newick(suppress_rooting=False))[0:5000])

    # The remaining edges may still join nodes with overlapping labels.
    # Resolve them shortest edge first.  A list (not a set) keeps ties in
    # post-order, so the result no longer depends on hash order.
    candidate_edges = []
    for e in subsets_tree._tree.postorder_edge_iter():
        if e.tail_node and e.head_node.alignment_subset_job.intersection(
                e.tail_node.alignment_subset_job):
            candidate_edges.append((e.length, e))
    # Sort on length only; missing or zero lengths go first.
    candidate_edges.sort(key=lambda item: item[0] if item[0] else -1)

    for _, edge in candidate_edges:
        # Re-check: earlier contractions may have changed the labels.
        common = edge.tail_node.alignment_subset_job.intersection(
            edge.head_node.alignment_subset_job)
        if common:
            edge.tail_node.alignment_subset_job = common
            if edge.head_node.child_nodes():
                edge.collapse()
            else:
                edge.tail_node.remove_child(edge.head_node)

    # Detach the job objects from the nodes (they can cause deep-copy
    # problems), give every node its final label, and keep a
    # label -> job mapping.
    self.pasta_team.subsets = {}
    for node in subsets_tree._tree.postorder_node_iter():
        assert len(node.alignment_subset_job) == 1
        nalsj = node.alignment_subset_job.pop()
        node.alignment_subset_job = None
        node.label = nalsj.tmp_dir_par[name_start:]  # last part of the name
        self.pasta_team.subsets[node.label] = nalsj
        if node.is_leaf():
            # Add a dummy taxon, or else dendropy can get confused.
            node.taxon = subsets_tree._tree.taxon_namespace.new_taxon(
                label=node.label)

    _LOG.debug("Spanning tree is:\n %s" % subsets_tree)
    labels = [nd.label for nd in subsets_tree._tree.postorder_node_iter()]
    if len(set(labels)) != len(labels):
        import collections
        raise Exception("Duplicate names found %s" % "\n".join(
            item for item, count in collections.Counter(labels).items()
            if count > 1))
    return subsets_tree
def launch_alignment(self, context_str=None):
    """Queue the merge work for the subsets tree held in ``self.tree``.

    A tree with exactly two nodes is a pairwise merge: the alignment jobs
    registered in ``self.pasta_team.subsets`` under the two node labels
    become ``subjob1`` / ``subjob2`` and ``skip_merge`` is set to False.
    Any other tree is rerooted next to its centroid edge and split into
    one ``PASTAMergerJob`` per child of the new root; those jobs are
    stored in ``self.merge_job_list`` and launched recursively, and
    ``skip_merge`` is set to True.

    Args:
        context_str: stored on ``self.context_str``; ``None`` is stored
            as the empty string.

    Raises:
        RuntimeError: if ``self.killed`` is set at any of the checkpoints.
    """
    if self.killed:
        raise RuntimeError("PastaAligner Job killed")

    self._reset_jobs()
    self.context_str = '' if context_str is None else context_str

    node_count = self.tree.count_nodes()
    _LOG.debug("Recursive merge on a branch with %d subsets" % (node_count))
    prefix = "subsets tree: %s" % self.tree.compose_newick()[0:200]

    if node_count == 2:
        nodes = self.tree._tree.nodes()
        _LOG.debug("%s ... pairwise merge " % prefix)
        self.skip_merge = False
        subsets = self.pasta_team.subsets
        self.subjob1 = subsets[nodes[0].label]
        self.subjob2 = subsets[nodes[1].label]
        for subjob in (self.subjob1, self.subjob2):
            subjob.add_parent(self)
            self.add_child(subjob)
    else:
        _LOG.debug("%s ... recursing further " % prefix)
        self.skip_merge = True

        # Reroot near the centroid edge, on whichever end is not a tip
        # (the head node unless it is a leaf).
        ce = self.tree.get_centroid_edge(spanning=True)
        if ce.head_node.is_leaf():
            new_root = ce.tail_node
        else:
            new_root = ce.head_node
        self.tree._tree.reroot_at_node(new_root,
                                       suppress_unifurcations=False)
        _LOG.debug("rerooted to: %s ..."
                   % self.tree.compose_newick()[0:200])

        # One merge job per root child: temporarily strip every other
        # child, snapshot the pruned tree, then restore the siblings.
        queued = []
        root = self.tree._tree.seed_node
        children = root.child_nodes()
        for keepchild in children:
            removed = [
                root.reversible_remove_child(sibling,
                                             suppress_unifurcations=False)
                for sibling in children
                if sibling != keepchild
            ]
            t1 = PhylogeneticTree(Tree(self.tree._tree))
            # Reinsert in the opposite order of removal.
            for record in reversed(removed):
                root.reinsert_nodes(record)
            _LOG.debug("child = %s ..." % t1.compose_newick()[0:200])

            multilocus_dataset1 = \
                self.multilocus_dataset.new_with_shared_meta()
            if t1.count_nodes() == 2:
                ns = t1._tree.nodes()
                tmp_dir_par = self.get_pairwise_temp_dir(ns[0].label,
                                                         ns[1].label)
            else:
                tmp_dir_par = self.tmp_base_dir
            configuration = self.configuration()
            cj = PASTAMergerJob(multilocus_dataset=multilocus_dataset1,
                                pasta_team=self.pasta_team,
                                tree=t1,
                                tmp_base_dir=self.tmp_base_dir,
                                tmp_dir_par=tmp_dir_par,
                                delete_temps2=False,
                                **configuration)
            cj.add_parent(self)
            self.add_child(cj)
            queued.append(cj)

        self.merge_job_list = queued

        # NOTE(review): launching, signalling the event and the final
        # killed check are taken to belong to this recursive branch only
        # (merge_job_list is only assigned here) -- confirm.
        for merge_job in self.merge_job_list:
            if self.killed:
                raise RuntimeError("PastaAligner Job killed")
            merge_job.launch_alignment()

        self._merge_queued_event.set()

        if self.killed:
            raise RuntimeError("PastaAligner Job killed")
def build_subsets_tree(self, curr_tmp_dir_par):
    """Collapse the guide tree into a tree with one node per alignment subset.

    Each leaf of ``self.tree`` is tagged with the alignment job registered
    for it in ``self.pasta_team.subsets``.  Internal nodes are then labelled
    from their children, redundant edges are contracted, and remaining
    ambiguous edges are contracted in order of increasing length until every
    surviving node carries exactly one job.

    Args:
        curr_tmp_dir_par: path prefix.  The part of each job's
            ``tmp_dir_par`` that follows this prefix plus one separator
            character is used as the subset label.

    Returns:
        The ``PhylogeneticTree`` of subsets; every node's ``label`` is a
        subset label.

    Side effects:
        ``self.pasta_team.subsets`` is replaced by a dict mapping subset
        label -> alignment job.
    """
    name_start = len(curr_tmp_dir_par) + 1

    # Map original taxon labels to subset names, and subset names to a
    # singleton set holding the job (sets make the labelling below easy).
    translate = {}
    name_to_jobs = {}
    for leaf in self.tree._tree.leaf_iter():
        nalsj = self.pasta_team.subsets[leaf.taxon.label]
        newname = nalsj.tmp_dir_par[name_start:]
        translate[leaf.taxon.label] = newname
        name_to_jobs[newname] = set([nalsj])

    subsets_tree = PhylogeneticTree(
        read_newick_with_translate(StringIO(self.tree_str),
                                   translate_dict=translate))
    for leaf in subsets_tree._tree.leaf_iter():
        leaf.alignment_subset_job = name_to_jobs[leaf.taxon]
    del name_to_jobs
    del translate
    _LOG.debug("nodes labeled")

    # Make sure the tree is rooted at a branch (not at a node).
    top_children = subsets_tree._tree.seed_node.child_nodes()
    if len(top_children) > 2:
        subsets_tree._tree.reroot_at_edge(top_children[0].edge)
    _LOG.debug("Subset Labeling (start):\n%s"
               % str(subsets_tree.compose_newick()))

    # Label internal nodes from their children and collapse redundant edges.
    for node in subsets_tree._tree.postorder_internal_node_iter():
        # A node's label is the intersection of its children's labels,
        # unless that is empty, in which case it is their union.
        if getattr(node, "alignment_subset_job", None) is None:
            child_labels = [c.alignment_subset_job
                            for c in node.child_nodes()]
            node.alignment_subset_job = set.intersection(*child_labels)
            if not node.alignment_subset_job:
                node.alignment_subset_job = set.union(*child_labels)

        # Prune any child whose label encompasses this node's label.
        # Index-based on purpose: a collapse can attach new children to
        # this node, and those must be examined as well.
        i = 0
        while i < len(node.child_nodes()):
            child = node.child_nodes()[i]
            if node.alignment_subset_job.issubset(child.alignment_subset_job):
                # DendroPy cannot collapse an edge that leads to a tip;
                # remove the tip instead.
                if child.child_nodes():
                    child.edge.collapse()
                else:
                    node.remove_child(child)
            else:
                i += 1

    # The remaining edges may still join nodes with overlapping labels.
    # Resolve them shortest edge first.  Collect the candidates ...
    candidate_edges = []
    for e in subsets_tree._tree.postorder_edge_iter():
        if e.tail_node and e.head_node.alignment_subset_job.intersection(
                e.tail_node.alignment_subset_job):
            candidate_edges.append((e.length, e))
    # ... then sort on the length ONLY.  Sorting the bare tuples would fall
    # back to comparing the edge objects (or None with a float) whenever
    # lengths tie or are missing, which raises TypeError on Python 3.
    # Missing/zero lengths sort first; ties keep post-order.
    candidate_edges.sort(key=lambda item: item[0] if item[0] else -1)

    for _, edge in candidate_edges:
        # Re-check: earlier contractions may have changed the labels.
        common = edge.tail_node.alignment_subset_job.intersection(
            edge.head_node.alignment_subset_job)
        if common:
            edge.tail_node.alignment_subset_job = common
            if edge.head_node.child_nodes():
                edge.collapse()
            else:
                edge.tail_node.remove_child(edge.head_node)

    # Detach the job objects from the nodes (they can cause deep-copy
    # problems), give every node a label, and keep a label -> job mapping.
    self.pasta_team.subsets = {}
    for node in subsets_tree._tree.postorder_node_iter():
        assert len(node.alignment_subset_job) == 1
        nalsj = node.alignment_subset_job.pop()
        node.alignment_subset_job = None
        node.label = nalsj.tmp_dir_par[name_start:]
        self.pasta_team.subsets[node.label] = nalsj
        if node.is_leaf():
            # Add a dummy taxon, or else dendropy can get confused.
            node.taxon = Taxon(label=node.label)
    return subsets_tree
def launch_alignment(self, context_str=None):
    """Queue the merge work for the subsets tree held in ``self.tree``.

    A tree with exactly two nodes is a pairwise merge: the alignment jobs
    registered in ``self.pasta_team.subsets`` under the two node labels
    become ``subjob1`` / ``subjob2`` and ``skip_merge`` is set to False.
    Any other tree is rerooted next to its centroid edge and split into
    one ``PASTAMergerJob`` per child of the new root; those jobs are
    stored in ``self.merge_job_list`` and launched recursively, and
    ``skip_merge`` is set to True.

    Args:
        context_str: stored on ``self.context_str``; ``None`` is stored
            as the empty string.

    Raises:
        RuntimeError: if ``self.killed`` is set at any of the checkpoints.
    """
    if self.killed:
        raise RuntimeError("PastaAligner Job killed")

    self._reset_jobs()
    self.context_str = '' if context_str is None else context_str

    node_count = self.tree.count_nodes()
    _LOG.debug("Recursive merge on a branch with %d subsets" % (node_count))
    prefix = "subsets tree: %s" % self.tree.compose_newick()[0:200]

    if node_count == 2:
        nodes = self.tree._tree.nodes()
        _LOG.debug("%s ... pairwise merge " % prefix)
        self.skip_merge = False
        subsets = self.pasta_team.subsets
        self.subjob1 = subsets[nodes[0].label]
        self.subjob2 = subsets[nodes[1].label]
        for subjob in (self.subjob1, self.subjob2):
            subjob.add_parent(self)
            self.add_child(subjob)
    else:
        _LOG.debug("%s ... recursing further " % prefix)
        self.skip_merge = True

        # Reroot near the centroid edge, on whichever end is not a tip
        # (the head node unless it is a leaf).
        ce = self.tree.get_centroid_edge(spanning=True)
        if ce.head_node.is_leaf():
            new_root = ce.tail_node
        else:
            new_root = ce.head_node
        self.tree._tree.reroot_at_node(new_root, delete_outdegree_one=False)
        _LOG.debug("rerooted to: %s ..."
                   % self.tree.compose_newick()[0:200])

        # One merge job per root child: temporarily strip every other
        # child, snapshot the pruned tree, then restore the siblings.
        queued = []
        root = self.tree._tree.seed_node
        children = root.child_nodes()
        for keepchild in children:
            removed = [
                root.reversible_remove_child(sibling, suppress_deg_two=False)
                for sibling in children
                if sibling != keepchild
            ]
            t1 = PhylogeneticTree(Tree(self.tree._tree))
            # Reinsert in the opposite order of removal.
            for record in reversed(removed):
                root.reinsert_nodes(record)
            _LOG.debug("child = %s ..." % t1.compose_newick()[0:200])

            multilocus_dataset1 = \
                self.multilocus_dataset.new_with_shared_meta()
            if t1.count_nodes() == 2:
                ns = t1._tree.nodes()
                tmp_dir_par = self.get_pairwise_temp_dir(ns[0].label,
                                                         ns[1].label)
            else:
                tmp_dir_par = self.tmp_base_dir
            configuration = self.configuration()
            cj = PASTAMergerJob(multilocus_dataset=multilocus_dataset1,
                                pasta_team=self.pasta_team,
                                tree=t1,
                                tmp_base_dir=self.tmp_base_dir,
                                tmp_dir_par=tmp_dir_par,
                                delete_temps2=False,
                                **configuration)
            cj.add_parent(self)
            self.add_child(cj)
            queued.append(cj)

        self.merge_job_list = queued

        # NOTE(review): launching, signalling the event and the final
        # killed check are taken to belong to this recursive branch only
        # (merge_job_list is only assigned here) -- confirm.
        for merge_job in self.merge_job_list:
            if self.killed:
                raise RuntimeError("PastaAligner Job killed")
            merge_job.launch_alignment()

        self._merge_queued_event.set()

        if self.killed:
            raise RuntimeError("PastaAligner Job killed")