def select_closest_outgroup(target, n2content, splitterconf):
    def sort_outgroups(x, y):
        # Rank candidates by proximity to the target node, then by larger
        # clade size, then by higher branch support, and finally by cladeid
        # to keep the ordering deterministic.
        r = cmp(x[1], y[1])  # closer node
        if r == 0:
            r = -1 * cmp(len(n2content[x[0]]), len(n2content[y[0]]))  # larger node
            if r == 0:
                r = -1 * cmp(x[0].support, y[0].support)  # higher supported node
                if r == 0:
                    return cmp(x[0].cladeid, y[0].cladeid)  # by content name
        return r

    if not target.up:
        raise TaskError(None, "Cannot select outgroups for the root node!")

    # Prepare cutoffs
    out_topodist = tobool(splitterconf["_outgroup_topology_dist"])
    max_outgroup_size = max(int(float(splitterconf["_max_outgroup_size"]) *
                                len(n2content[target])), 1)
    out_min_support = float(splitterconf["_min_outgroup_support"])

    log.log(26, "Max outgroup size allowed %d" % max_outgroup_size)

    # Get a list of outside nodes and their distances to the current target node
    n2targetdist = distance_matrix_new(target, leaf_only=False,
                                       topology_only=out_topodist)

    valid_nodes = sorted([(node, ndist) for node, ndist in n2targetdist.iteritems()
                          if not (n2content[node] & n2content[target])
                          and node.support >= out_min_support
                          and len(n2content[node]) <= max_outgroup_size],
                         sort_outgroups)

    if valid_nodes:
        best_outgroup = valid_nodes[0][0]
    else:
        # Dump all candidates (sorted with the same criteria) before failing.
        print '\n'.join(["%s Size:%d Dist:%f Supp:%f"
                         % (node.cladeid, len(n2content[node]), ndist, node.support)
                         for node, ndist in sorted(n2targetdist.iteritems(),
                                                   sort_outgroups)])
        raise TaskError(None, "Could not find a suitable outgroup!")

    log.log(20,
            "Found possible outgroup Size:%d Distance:%f Support:%f",
            len(n2content[best_outgroup]), n2targetdist[best_outgroup],
            best_outgroup.support)

    log.log(20, "Supports: %0.2f (children=%s)", best_outgroup.support,
            ','.join(["%0.2f" % ch.support for ch in best_outgroup.children]))

    log.log(24, "best outgroup topology:\n%s", best_outgroup)
    #print target
    #print target.get_tree_root()

    seqs = [n.name for n in n2content[target]]
    outs = [n.name for n in n2content[best_outgroup]]

    return set(seqs), set(outs)

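# A minimal sketch (not part of the pipeline) of the splitter configuration
# keys consumed by select_closest_outgroup() above. The key names are taken
# from the function; the values are illustrative placeholders, not recommended
# defaults.
_EXAMPLE_CLOSEST_OUTGROUP_CONF = {
    "_outgroup_topology_dist": "no",   # tobool(): use real branch lengths, not topology-only distances
    "_max_outgroup_size": "0.2",       # fraction of the target partition size (at least 1 node is allowed)
    "_min_outgroup_support": "0.9",    # minimum branch support of candidate outgroup nodes
}
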
def process_task(task, wkname, npr_conf, nodeid2info):
    cogconf, cogclass = npr_conf.cog_selector
    concatconf, concatclass = npr_conf.alg_concatenator
    treebuilderconf, treebuilderclass = npr_conf.tree_builder
    splitterconf, splitterclass = npr_conf.tree_splitter

    threadid, nodeid, seqtype, ttype = (task.threadid, task.nodeid,
                                        task.seqtype, task.ttype)
    cladeid, targets, outgroups = db.get_node_info(threadid, nodeid)

    if not treebuilderclass or task.size < 4:
        # Allows dumping algs in workflows with no tree tasks, or when tree
        # inference does not make sense given the number of sequences.
        # DummyTree will produce a fake, fully collapsed newick tree.
        treebuilderclass = DummyTree

    if outgroups and len(outgroups) > 1:
        constrain_id = nodeid
    else:
        constrain_id = None

    node_info = nodeid2info[nodeid]
    conf = GLOBALS[task.configid]
    new_tasks = []

    if ttype == "cog_selector":
        # Generate an md5 id based on the genetree configuration workflow used
        # for the concat alg task. If something changes, the concat alg will
        # change and the associated tree will be rebuilt.
        config_blocks = set([wkname])
        for key, value in conf[wkname].iteritems():
            if isinstance(value, (list, tuple, set)):
                for elem in value:
                    if isinstance(elem, str) and elem.startswith("@"):
                        config_blocks.add(elem[1:])
            elif isinstance(value, str) and value.startswith("@"):
                config_blocks.add(value[1:])
        config_checksum = md5(''.join(["[%s]\n%s" % (x, dict_string(conf[x]))
                                       for x in sorted(config_blocks)]))

        # THIS PART HAS BEEN MOVED TO COG_SELECTOR TASK
        # Check that the current selection of cogs covers all target and
        # outgroup species
        #cog_hard_limit = int(conf[concatconf]["_max_cogs"])
        #sp_repr = defaultdict(int)
        #for co in task.raw_cogs[:cog_hard_limit]:
        #    for sp, seq in co:
        #        sp_repr[sp] += 1
        #missing_sp = (targets | outgroups) - set(sp_repr.keys())
        #if missing_sp:
        #    raise TaskError("missing species under current cog selection: %s" %missing_sp)
        #else:
        #    log.log(28, "Analysis of current COG selection:")
        #    for sp, ncogs in sorted(sp_repr.items(), key=lambda x: x[1]):
        #        log.log(28, " % 30s species present in % 6d COGs" %(sp, ncogs))

        # Register the concat alignment task. The nodeid associated with
        # concat_alg tasks and all their child jobs should take into account
        # COG information, not only the species and outgroups included.
        concat_job = concatclass(task.cogs, seqtype, conf, concatconf,
                                 config_checksum)
        db.add_node(threadid, concat_job.nodeid, cladeid, targets, outgroups)

        # Register tree constraints
        constrain_tree = "(%s, (%s));" % (','.join(sorted(outgroups)),
                                          ','.join(sorted(targets)))
        _outs = "\n".join(map(lambda name: ">%s\n0" % name, sorted(outgroups)))
        _tars = "\n".join(map(lambda name: ">%s\n1" % name, sorted(targets)))
        constrain_alg = '\n'.join([_outs, _tars])
        db.add_task_data(concat_job.nodeid, DATATYPES.constrain_tree,
                         constrain_tree)
        db.add_task_data(concat_job.nodeid, DATATYPES.constrain_alg,
                         constrain_alg)
        db.dataconn.commit()  # since the creation of some Task objects may
                              # require this info, I need to commit right now.
        concat_job.size = task.size
        new_tasks.append(concat_job)

    elif ttype == "concat_alg":
        # Register a tree task for the concat alignment, using the constraint
        # tree if necessary.
        alg_id = db.get_dataid(task.taskid, DATATYPES.concat_alg_phylip)
        try:
            parts_id = db.get_dataid(task.taskid, DATATYPES.model_partitions)
        except ValueError:
            parts_id = None

        nodeid2info[nodeid]["size"] = task.size
        nodeid2info[nodeid]["target_seqs"] = targets
        nodeid2info[nodeid]["out_seqs"] = outgroups

        tree_task = treebuilderclass(nodeid, alg_id, constrain_id, None,
                                     task.seqtype, conf, treebuilderconf,
                                     parts_id=parts_id)
        tree_task.size = task.size
        new_tasks.append(tree_task)

    elif ttype == "tree":
        merger_task = splitterclass(nodeid, seqtype, task.tree_file, conf,
                                    splitterconf)
        merger_task.size = task.size
        new_tasks.append(merger_task)

    elif ttype == "treemerger":
        # Let's merge with the main tree
        if not task.task_tree:
            task.finish()

        log.log(24, "Saving task tree...")
        annotate_node(task.task_tree, task)
        db.update_node(nid=task.nodeid, runid=task.threadid,
                       newick=db.encode(task.task_tree))
        db.commit()

        if treebuilderclass is not DummyTree and npr_conf.max_iters > 1:
            current_iter = get_iternumber(threadid)
            if npr_conf.max_iters and current_iter >= npr_conf.max_iters:
                log.warning("Maximum number of iterations reached!")
            else:
                # Add new nodes
                source_seqtype = "aa" if "aa" in GLOBALS["seqtypes"] else "nt"
                ttree, mtree = task.task_tree, task.main_tree

                log.log(26, "Processing tree: %s seqs, %s outgroups",
                        len(targets), len(outgroups))

                target_cladeids = None
                if tobool(conf[splitterconf].get("_find_ncbi_targets", False)):
                    tcopy = mtree.copy()
                    ncbi.connect_database()
                    tax2name, tax2track = ncbi.annotate_tree_with_taxa(tcopy, None)
                    #tax2name, tax2track = ncbi.annotate_tree_with_taxa(tcopy, "fake") # for testing sptree example
                    n2content = tcopy.get_cached_content()
                    broken_branches, broken_clades, broken_clade_sizes, tax2name = \
                        ncbi.get_broken_branches(tcopy, n2content)
                    log.log(28, 'restricting NPR to broken clades: ' +
                            colorify(', '.join(["%s" % tax2name[x] for x in broken_clades]), "wr"))
                    target_cladeids = set()
                    for branch in broken_branches:
                        print branch.get_ascii(attributes=['spname', 'taxid'], compact=True)
                        print ["%s" % tax2name[x] for x in broken_branches[branch]]
                        target_cladeids.add(branch.cladeid)

                for node, seqs, outs, wkname in get_next_npr_node(
                        task.configid, ttree, task.out_seqs, mtree,
                        None, npr_conf, target_cladeids):  # None avoids alg checks
                    log.log(24, "Adding new node: %s seqs, %s outgroups",
                            len(seqs), len(outs))
                    new_task_node = cogclass(seqs, outs, source_seqtype,
                                             conf, cogconf)
                    new_task_node.target_wkname = wkname
                    new_tasks.append(new_task_node)
                    db.add_node(threadid,
                                new_task_node.nodeid, new_task_node.cladeid,
                                new_task_node.targets, new_task_node.outgroups)

    return new_tasks

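# A minimal, self-contained sketch of the constraint strings registered by the
# "cog_selector" branch of process_task() above, using hypothetical taxon
# names (O1/O2 as outgroups, T1/T2/T3 as targets). It only mirrors the string
# formatting above and is not part of the pipeline.
def _example_constraints():
    outgroups = set(["O1", "O2"])
    targets = set(["T1", "T2", "T3"])
    constrain_tree = "(%s, (%s));" % (','.join(sorted(outgroups)),
                                      ','.join(sorted(targets)))
    # constrain_tree == "(O1,O2, (T1,T2,T3));"
    _outs = "\n".join(map(lambda name: ">%s\n0" % name, sorted(outgroups)))
    _tars = "\n".join(map(lambda name: ">%s\n1" % name, sorted(targets)))
    constrain_alg = '\n'.join([_outs, _tars])
    # constrain_alg == ">O1\n0\n>O2\n0\n>T1\n1\n>T2\n1\n>T3\n1"
    # i.e. a two-partition guide tree plus a binary "alignment" marking
    # outgroup taxa with 0 and target taxa with 1.
    return constrain_tree, constrain_alg
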
def select_outgroups(target, n2content, splitterconf):
    """Given a set of target sequences, find the best set of outgroup
    sequences to use. Several strategies can be used to select the
    outgroup set.
    """
    name2dist = {"min": numpy.min, "max": numpy.max,
                 "mean": numpy.mean, "median": numpy.median}

    #policy = splitterconf["_outgroup_policy"]  # node or leaves
    out_topodist = tobool(splitterconf["_outgroup_topology_dist"])
    optimal_out_size = int(splitterconf["_max_outgroup_size"])
    #out_distfn = splitterconf["_outgroup_dist"]
    out_min_support = float(splitterconf["_outgroup_min_support"])

    if not target.up:
        raise TaskError(None, "Cannot select outgroups for the root node!")
    if not optimal_out_size:
        raise TaskError(None, "You are trying to set 0 outgroups!")

    # Get a list of outside nodes and their distances to the current target node
    n2targetdist = distance_matrix_new(target, leaf_only=False,
                                       topology_only=out_topodist)

    #kk, test = distance_matrix(target, leaf_only=False,
    #                           topology_only=False)
    #for x in test:
    #    if test[x] != n2targetdist[x]:
    #        print x
    #        print test[x], n2targetdist[x]
    #        print x.get_distance(target)
    #        raw_input("ERROR!")

    # Score each candidate by branch support, by how close its size is to the
    # optimal outgroup size, and by its proximity to the target node. All
    # three components range between 0 and 1 (higher is better).
    score = lambda _n: (_n.support,
                        #len(n2content[_n]) / float(optimal_out_size),
                        1 - (abs(optimal_out_size - len(n2content[_n])) /
                             float(max(optimal_out_size, len(n2content[_n])))),  # outgroup size
                        1 - (n2targetdist[_n] / max_dist)  # outgroup proximity to target
                        )

    def sort_outgroups(x, y):
        # Compare candidates by the worst of their remaining score components:
        # repeatedly discard the shared minimum until a difference is found.
        score_x = set(score(x))
        score_y = set(score(y))
        v = 0
        while score_x and score_y:
            min_score_x = min(score_x)
            v = cmp(min_score_x, min(score_y))
            if v == 0:
                score_x.discard(min_score_x)
                score_y.discard(min_score_x)
            else:
                break
        # If still equal, sort by cladeid to maintain reproducibility
        if v == 0:
            v = cmp(x.cladeid, y.cladeid)
        return v

    #del n2targetdist[target.get_tree_root()]
    max_dist = max(n2targetdist.values())
    valid_nodes = [n for n in n2targetdist if
                   not (n2content[n] & n2content[target]) and
                   n.support >= out_min_support]
    if not valid_nodes:
        raise TaskError(None, "Could not find a suitable outgroup (min_support=%s)"
                        % out_min_support)
    valid_nodes.sort(sort_outgroups, reverse=True)
    best_outgroup = valid_nodes[0]

    seqs = [n.name for n in n2content[target]]
    outs = [n.name for n in n2content[best_outgroup]]

    log.log(20,
            "Found possible outgroup of size %s: score (support,size,dist)=%s",
            len(outs), score(best_outgroup))

    log.log(20, "Supports: %0.2f (children=%s)", best_outgroup.support,
            ','.join(["%0.2f" % ch.support for ch in best_outgroup.children]))

    if DEBUG():
        root = target.get_tree_root()
        for _seq in outs:
            tar = root & _seq
            tar.img_style["fgcolor"] = "green"
            tar.img_style["size"] = 12
            tar.img_style["shape"] = "circle"
        target.img_style["bgcolor"] = "lightblue"
        NPR_TREE_STYLE.title.clear()
        NPR_TREE_STYLE.title.add_face(
            faces.TextFace("MainTree: Outgroup selection is marked in green."
                           " Red=optimized nodes", fgcolor="blue"), 0)
        root.show(tree_style=NPR_TREE_STYLE)
        for _n in root.traverse():
            _n.img_style = None

    return set(seqs), set(outs)

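# A minimal sketch (illustrative values, not defaults) of the splitter
# configuration keys read by select_outgroups() above. Note that, unlike
# select_closest_outgroup(), "_max_outgroup_size" is interpreted here as an
# absolute node count (the optimal outgroup size), not as a fraction of the
# target partition, and the support key is spelled "_outgroup_min_support"
# rather than "_min_outgroup_support".
_EXAMPLE_OUTGROUP_CONF = {
    "_outgroup_topology_dist": "no",   # tobool(): measure real branch lengths
    "_max_outgroup_size": "4",         # optimal outgroup size, in number of nodes
    "_outgroup_min_support": "0.9",    # minimum support of candidate outgroup nodes
}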