Esempio n. 1
0
def select_closest_outgroup(target, n2content, splitterconf):
    def sort_outgroups(x,y):
        r = cmp(x[1], y[1]) # closer node
        if r == 0:
            r = -1 * cmp(len(n2content[x[0]]), len(n2content[y[0]])) # larger node
            if r == 0:
                r = -1 * cmp(x[0].support, y[0].support) # higher supported node
                if r == 0:
                    return cmp(x[0].cladeid, y[0].cladeid) # by content name
                else:
                    return r
            else:
                return r
        else:
            return r
    
    if not target.up:
        raise TaskError(None, "Cannot select outgroups for the root node!")
        
    # Prepare cutoffs
    out_topodist = tobool(splitterconf["_outgroup_topology_dist"])
    max_outgroup_size = max(int(float(splitterconf["_max_outgroup_size"]) * len(n2content[target])), 1)
    out_min_support = float(splitterconf["_min_outgroup_support"])

    log.log(26, "Max outgroup size allowed %d" %max_outgroup_size)
    
    # Gets a list of outside nodes an their distance to current target node
    n2targetdist = distance_matrix_new(target, leaf_only=False,
                                               topology_only=out_topodist)

    valid_nodes = sorted([(node, ndist) for node, ndist in n2targetdist.iteritems()
                          if not(n2content[node] & n2content[target])
                          and node.support >= out_min_support 
                          and len(n2content[node])<=max_outgroup_size],
                         sort_outgroups)
    if valid_nodes:
        best_outgroup = valid_nodes[0][0]
    else:
        print '\n'.join(sorted(["%s Size:%d Dist:%f Supp:%f" %(node.cladeid, len(n2content[node]), ndist, node.support)
                                for node, ndist in n2targetdist.iteritems()],
                               sort_outgroups))
        raise TaskError(None, "Could not find a suitable outgroup!")

    log.log(20,
            "Found possible outgroup Size:%d Distance:%f Support:%f",
            len(n2content[best_outgroup]), n2targetdist[best_outgroup], best_outgroup.support)
   
    log.log(20, "Supports: %0.2f (children=%s)", best_outgroup.support,
            ','.join(["%0.2f" % ch.support for ch in
                      best_outgroup.children]))
    
    log.log(24, "best outgroup topology:\n%s", best_outgroup)
    #print target
    #print target.get_tree_root()
   
    seqs = [n.name for n in n2content[target]]
    outs = [n.name for n in n2content[best_outgroup]]
    
    return set(seqs), set(outs)
Esempio n. 2
0
def process_task(task, wkname, npr_conf, nodeid2info):
    cogconf, cogclass = npr_conf.cog_selector
    concatconf, concatclass = npr_conf.alg_concatenator
    treebuilderconf, treebuilderclass = npr_conf.tree_builder
    splitterconf, splitterclass = npr_conf.tree_splitter
    
    threadid, nodeid, seqtype, ttype = (task.threadid, task.nodeid,
                                        task.seqtype, task.ttype)
    cladeid, targets, outgroups = db.get_node_info(threadid, nodeid)

    if not treebuilderclass or task.size < 4:
        # Allows to dump algs in workflows with no tree tasks or if tree
        # inference does not make sense given the number of sequences. DummyTree
        # will produce a fake fully collapsed newick tree.
        treebuilderclass = DummyTree
    
    if outgroups and len(outgroups) > 1:
        constrain_id = nodeid
    else:
        constrain_id = None
        
    node_info = nodeid2info[nodeid]
    conf = GLOBALS[task.configid]
    new_tasks = []    
    if ttype == "cog_selector":
       
        # Generates a md5 id based on the genetree configuration workflow used
        # for the concat alg task. If something changes, concat alg will change
        # and the associated tree will be rebuilt
        config_blocks = set([wkname])
        for key, value in conf[wkname].iteritems():
            if isinstance(value, list) or  isinstance(value, tuple) \
                    or isinstance(value, set):
                for elem in value:
                    config_blocks.add(elem[1:]) if isinstance(elem, str) and elem.startswith("@") else None
            elif isinstance(value, str):
                config_blocks.add(value[1:]) if value.startswith("@") else None
        config_checksum =  md5(''.join(["[%s]\n%s" %(x, dict_string(conf[x]))
                                        for x in sorted(config_blocks)]))

        # THIS PART HAS BEEN MOVED TO COG_SELECTOR TASK
        # Check that current selection of cogs will cover all target and
        # outgroup species
        #cog_hard_limit = int(conf[concatconf]["_max_cogs"])
        #sp_repr = defaultdict(int)
        #for co in task.raw_cogs[:cog_hard_limit]:
        #    for sp, seq in co:
        #        sp_repr[sp] += 1
        #missing_sp = (targets | outgroups) - set(sp_repr.keys())
        #if missing_sp:
        #    raise TaskError("missing species under current cog selection: %s" %missing_sp)
        #else:
        #    log.log(28, "Analysis of current COG selection:")
        #    for sp, ncogs in sorted(sp_repr.items(), key=lambda x:x[1]):
        #        log.log(28, "   % 30s species present in % 6d COGs" %(sp, ncogs))
                
        # register concat alignment task. NodeId associated to concat_alg tasks
        # and all its children jobs should take into account cog information and
        # not only species and outgroups included.
        
        concat_job = concatclass(task.cogs, seqtype, conf, concatconf,
                                 config_checksum)
        db.add_node(threadid,
                    concat_job.nodeid, cladeid,
                    targets, outgroups)

        # Register Tree constrains
        constrain_tree = "(%s, (%s));" %(','.join(sorted(outgroups)), 
                                         ','.join(sorted(targets)))
        _outs = "\n".join(map(lambda name: ">%s\n0" %name, sorted(outgroups)))
        _tars = "\n".join(map(lambda name: ">%s\n1" %name, sorted(targets)))
        constrain_alg = '\n'.join([_outs, _tars])
        db.add_task_data(concat_job.nodeid, DATATYPES.constrain_tree, constrain_tree)
        db.add_task_data(concat_job.nodeid, DATATYPES.constrain_alg, constrain_alg)
        db.dataconn.commit() # since the creation of some Task objects
                             # may require this info, I need to commit
                             # right now.
        concat_job.size = task.size
        new_tasks.append(concat_job)
       
    elif ttype == "concat_alg":
        # register tree for concat alignment, using constraint tree if
        # necessary
        alg_id = db.get_dataid(task.taskid, DATATYPES.concat_alg_phylip)
        try:
            parts_id = db.get_dataid(task.taskid, DATATYPES.model_partitions)
        except ValueError:
            parts_id = None

        nodeid2info[nodeid]["size"] = task.size
        nodeid2info[nodeid]["target_seqs"] = targets
        nodeid2info[nodeid]["out_seqs"] = outgroups
        tree_task = treebuilderclass(nodeid, alg_id,
                                     constrain_id, None,
                                     task.seqtype, conf, treebuilderconf,
                                     parts_id=parts_id)
        tree_task.size = task.size
        new_tasks.append(tree_task)
        
    elif ttype == "tree":
        merger_task = splitterclass(nodeid, seqtype, task.tree_file, conf, splitterconf)
        merger_task.size = task.size
        new_tasks.append(merger_task)

    elif ttype == "treemerger":
        # Lets merge with main tree
        if not task.task_tree:
            task.finish()

        log.log(24, "Saving task tree...")
        annotate_node(task.task_tree, task)
        db.update_node(nid=task.nodeid, runid=task.threadid,
                       newick=db.encode(task.task_tree))
        db.commit()

        if not isinstance(treebuilderclass, DummyTree) and npr_conf.max_iters > 1:
            current_iter = get_iternumber(threadid)
            if npr_conf.max_iters and current_iter >= npr_conf.max_iters:
                log.warning("Maximum number of iterations reached!")
            else:
                # Add new nodes
                source_seqtype = "aa" if "aa" in GLOBALS["seqtypes"] else "nt"
                ttree, mtree = task.task_tree, task.main_tree

                log.log(26, "Processing tree: %s seqs, %s outgroups",
                        len(targets), len(outgroups))

                target_cladeids = None
                if tobool(conf[splitterconf].get("_find_ncbi_targets", False)):
                    tcopy = mtree.copy()
                    ncbi.connect_database()
                    tax2name, tax2track = ncbi.annotate_tree_with_taxa(tcopy, None)
                    #tax2name, tax2track = ncbi.annotate_tree_with_taxa(tcopy, "fake") # for testing sptree example
                    n2content = tcopy.get_cached_content()
                    broken_branches, broken_clades, broken_clade_sizes, tax2name = ncbi.get_broken_branches(tcopy, n2content)
                    log.log(28, 'restricting NPR to broken clades: '+
                            colorify(', '.join(map(lambda x: "%s"%tax2name[x], broken_clades)), "wr"))
                    target_cladeids = set()
                    for branch in broken_branches:
                        print branch.get_ascii(attributes=['spname', 'taxid'], compact=True)
                        print map(lambda x: "%s"%tax2name[x], broken_branches[branch])
                        target_cladeids.add(branch.cladeid)

                for node, seqs, outs, wkname in get_next_npr_node(task.configid, ttree,
                                                          task.out_seqs, mtree, None,
                                                          npr_conf, target_cladeids): # None is to avoid alg checks
                    log.log(24, "Adding new node: %s seqs, %s outgroups",
                            len(seqs), len(outs))
                    new_task_node = cogclass(seqs, outs,
                                             source_seqtype, conf, cogconf)
                    new_task_node.target_wkname = wkname
                    new_tasks.append(new_task_node)
                    db.add_node(threadid,
                                new_task_node.nodeid, new_task_node.cladeid,
                                new_task_node.targets,
                                new_task_node.outgroups)
    return new_tasks
Esempio n. 3
0
def select_outgroups(target, n2content, splitterconf):
    """Given a set of target sequences, find the best set of out
    sequences to use. Several ways can be selected to find out
    sequences:

    Nodes disjoint from the target clade whose support reaches
    ``_outgroup_min_support`` are ranked by a (support, size fit,
    proximity) score tuple; the leaves of the best-ranked node become
    the outgroup.

    :param target: tree node to find an outgroup for; must not be the root.
    :param n2content: mapping node -> set of leaf nodes under it.
    :param splitterconf: config dict read for _outgroup_topology_dist,
        _max_outgroup_size and _outgroup_min_support.
    :returns: (set of target leaf names, set of outgroup leaf names)
    :raises TaskError: for the root node, a zero outgroup size, or when
        no candidate passes the support filter.
    """
    
    # Distance-aggregation functions.  NOTE(review): unused in the current
    # code path (see the commented-out _outgroup_dist option below) —
    # presumably kept for alternative outgroup policies; confirm.
    name2dist = {"min": numpy.min, "max": numpy.max,
                 "mean":numpy.mean, "median":numpy.median}
  
    
    #policy = splitterconf["_outgroup_policy"]  # node or leaves
    out_topodist = tobool(splitterconf["_outgroup_topology_dist"])  # use topology-only distances?
    optimal_out_size = int(splitterconf["_max_outgroup_size"])  # desired outgroup size
    #out_distfn = splitterconf["_outgroup_dist"]
    out_min_support = float(splitterconf["_outgroup_min_support"])
    
    if not target.up:
        raise TaskError(None, "Cannot select outgroups for the root node!")
    if not optimal_out_size:
        raise TaskError(None, "You are trying to set 0 outgroups!")

    # Gets a list of outside nodes an their distance to current target node
    n2targetdist = distance_matrix_new(target, leaf_only=False,
                                               topology_only=out_topodist)

    #kk, test = distance_matrix(target, leaf_only=False,
    #                       topology_only=False)

    #for x in test:
    #    if test[x] != n2targetdist[x]:
    #        print x
    #        print test[x],  n2targetdist[x]
    #        print x.get_distance(target)
    #        raw_input("ERROR!")
        
    # Score components (each in [0, 1], higher is better): branch support,
    # closeness of the clade size to optimal_out_size, and proximity to
    # the target.  NOTE: max_dist is a late-binding closure over the
    # variable assigned further down, so score() must not be called
    # before that assignment.
    score = lambda _n: (_n.support,
                        #len(n2content[_n])/float(optimal_out_size),
                        1 - (abs(optimal_out_size - len(n2content[_n])) / float(max(optimal_out_size, len(n2content[_n])))), # outgroup size
                        1 - (n2targetdist[_n] / max_dist) #outgroup proximity to target
                        ) 
    
    def sort_outgroups(x,y):
        # cmp-style comparator: repeatedly compare the smallest remaining
        # score component of each node ("maximize the worst component").
        # NOTE(review): set() deduplicates equal score components, so a
        # node with repeated component values loses entries relative to
        # the raw tuple — confirm this tie-breaking behaviour is intended.
        score_x = set(score(x))
        score_y = set(score(y))
        while score_x:
            min_score_x = min(score_x)

            v = cmp(min_score_x, min(score_y))
            if v == 0:
                # Equal worst components: drop them from both and retry.
                score_x.discard(min_score_x)
                score_y.discard(min_score_x)
            else:
                break
        # If still equal, sort by cladid to maintain reproducibility
        if v == 0:
            v = cmp(x.cladeid, y.cladeid)
        return v
        
    #del n2targetdist[target.get_tree_root()]
    # Normalisation constant for the proximity score component above.
    max_dist = max(n2targetdist.values())
    # Candidates: nodes outside the target clade with enough support.
    valid_nodes = [n for n in n2targetdist if \
                       not n2content[n] & n2content[target] and
                       n.support >= out_min_support]
    if not valid_nodes:
        raise TaskError(None, "Could not find a suitable outgroup (min_support=%s)"\
                      %out_min_support)
    # Best-scoring node first (Python 2 cmp-based sort).
    valid_nodes.sort(sort_outgroups, reverse=True)
    best_outgroup = valid_nodes[0]
    seqs = [n.name for n in n2content[target]]
    outs = [n.name for n in n2content[best_outgroup]]
   
    log.log(20,
            "Found possible outgroup of size %s: score (support,size,dist)=%s",
            len(outs), score(best_outgroup))
   
    log.log(20, "Supports: %0.2f (children=%s)", best_outgroup.support,
            ','.join(["%0.2f" % ch.support for ch in
                      best_outgroup.children]))
    
    # Optional visual debugging: highlight the selected outgroup leaves in
    # green and the target clade in blue on the main tree display.
    if DEBUG():
        root = target.get_tree_root()
        for _seq in outs:
            tar =  root & _seq
            tar.img_style["fgcolor"]="green"
            tar.img_style["size"] = 12
            tar.img_style["shape"] = "circle"
        target.img_style["bgcolor"] = "lightblue"
        NPR_TREE_STYLE.title.clear()
        NPR_TREE_STYLE.title.add_face( faces.TextFace("MainTree:"
            " Outgroup selection is mark in green. Red=optimized nodes ",
            fgcolor="blue"), 0)
        root.show(tree_style=NPR_TREE_STYLE)
        for _n in root.traverse():
            # Reset styles so the debug colouring does not leak into later
            # tree renderings.
            _n.img_style = None
        
    return set(seqs), set(outs)