Esempio n. 1
0
 def load_stored_data(self):
     """Restore the COG results previously stored for this task.

     Reads the ``cogs`` and ``cog_analysis`` entries registered under
     ``self.taskid`` and exposes each one as an attribute of the same name.
     """
     for field in ("cogs", "cog_analysis"):
         setattr(self, field,
                 db.get_task_data(self.taskid, getattr(DATATYPES, field)))
Esempio n. 2
0
 def load_stored_data(self):
     """Restore the tree statistics previously stored for this task.

     Only ``self.stats`` is populated here; the lookup of the stored tree
     data id (``self.tree_file``) is currently disabled.
     """
     stored_stats = db.get_task_data(self.taskid, DATATYPES.tree_stats)
     self.stats = stored_stats
Esempio n. 3
0
 def load_stored_data(self):
     """Restore the model selection results stored for this task.

     Sets ``self.best_model`` and ``self.model_ranking`` from the entries
     registered under ``self.taskid``.
     """
     fetch = db.get_task_data
     self.best_model = fetch(self.taskid, DATATYPES.best_model)
     self.model_ranking = fetch(self.taskid, DATATYPES.model_ranking)
Esempio n. 4
0
 def load_stored_data(self):
     """Reload the kept alignment columns stored for this task.

     The existing ``self.kept_columns`` list is emptied in place (so other
     references to the same list see the change) and the stored value is
     then appended to it as a single element.
     """
     columns = self.kept_columns
     # Empty the list before querying the database, keeping the same object
     del columns[:]
     columns.append(db.get_task_data(self.taskid, DATATYPES.kept_alg_columns))
Esempio n. 5
0
def split_tree(task_tree_node, task_outgroups, main_tree, alg_path, npr_conf, threadid, target_cladeids):
    """Browses a task tree from root to leaves and yields next
    suitable nodes for NPR iterations. Each yielded node comes with
    the set of target and outgroup tips.

    Arguments:

      task_tree_node: node (already merged into main_tree) from which
         the search starts.
      task_outgroups: names of the outgroup tips used by the current
         task. Together with the leaf names under task_tree_node they
         define the content of the current tree.
      main_tree: tree object providing get_cached_content() and
         get_common_ancestor().
      alg_path: dot-separated string whose parts are passed to
         db.get_task_data() to load an alignment, or a false value to
         disable the sequence similarity filters.
      npr_conf: configuration object. Its npr_workflows (pairs of
         workflow name and filter dictionary), use_outgroup and
         tree_splitter attributes are read.
      threadid: key of the current thread configuration in GLOBALS.
      target_cladeids: if not empty, only nodes whose cladeid is in
         this container are considered.

    Yields tuples of (node, seqs, outs, workflow_name), where
    workflow_name is the first workflow whose filters the node passed.

    This is a generator: nothing is computed until it is iterated, and
    the final summary is logged only when it is exhausted.
    """


    def processable_node(_n):
        """This an internal function that returns true if a given node
        is suitable for a NPR iteration. It can be used as
        "is_leaf_fn" when traversing a tree.

        Note that this function uses several variables which change within the
        split_tree function, so must be kept within its namespace.

        When the node is accepted, the name of the matching workflow
        is stored in its _target_wkname attribute.

        """
        is_leaf = False
        for wkname, wkfilter in npr_conf.npr_workflows:
            # if node is not in the targets or does not meet size filters, skip
            # workflow
            if _n is master_node or \
               (_TARGET_NODES and _n not in _TARGET_NODES) or \
               (target_cladeids and _n.cladeid not in target_cladeids) or \
               len(n2content[_n]) < max(wkfilter.get("min_size", 3), 3) or \
               ("max_size" in wkfilter and len(n2content[_n]) > wkfilter["max_size"]):
                continue

            # If seq_sim filter used, calculate node stats
            if ALG and ("min_seq_sim" in wkfilter or "max_seq_sim" in wkfilter):
                # Stats are computed when they are missing, but also when a
                # previous workflow without seq_sim filters reset them to
                # None (see the else branch below). Otherwise None would be
                # compared against the thresholds.
                if getattr(_n, "seqs_mean_ident", None) is None:
                    log.log(20, "Calculating node sequence stats...")
                    mx, mn, avg, std = get_seqs_identity(ALG,
                                                         [__n.name for __n in n2content[_n]])
                    _n.add_features(seqs_max_ident=mx, seqs_min_ident=mn,
                                    seqs_mean_ident=avg, seqs_std_ident=std)
                    log.log(20, "mx=%s, mn=%s, avg=%s, std=%s" %(mx, mn, avg, std))

                # Each bound is optional: only the configured ones are
                # applied, so a workflow defining a single bound does not
                # fail with a KeyError.
                if "min_seq_sim" in wkfilter and \
                   _n.seqs_mean_ident < wkfilter["min_seq_sim"]:
                    continue

                if "max_seq_sim" in wkfilter and \
                   _n.seqs_mean_ident > wkfilter["max_seq_sim"]:
                    continue

            else:
                _n.add_features(seqs_max_ident=None, seqs_min_ident=None,
                                seqs_mean_ident=None, seqs_std_ident=None)

            if "min_support" in wkfilter:
                # If we are optimizing only lowly supported nodes, and nodes are
                # optimized without an outgroup, our target node is actually the
                # parent of lowly supported nodes. Therefore, I check if support
                # is low in children nodes, and return this node if so.
                if not npr_conf.use_outgroup:
                    if not [_ch for _ch in _n.children if _ch.support <= wkfilter["min_support"]]:
                        continue
                # Otherwise, just skip the node if it above the min support
                elif _n.support > wkfilter["min_support"]:
                    continue

            # At this point, all filters of this workflow were met, so the
            # node can be optimized
            is_leaf = True
            _n._target_wkname = wkname
            break

        return is_leaf

    log.log(20, "Loading tree content...")
    n2content = main_tree.get_cached_content()
    if alg_path:
        log.log(20, "Loading associated alignment to check seq. similarity")
        raw_alg = db.get_task_data(*alg_path.split("."))
        ALG = SeqGroup(raw_alg)
    else:
        ALG = None

    log.log(20, "Finding next NPR nodes...")
    # task_tree_node is actually a node in main_tree, since it has been
    # already merged
    trees_to_browse = [task_tree_node]
    npr_nodes = 0
    # loads current tree content, so we can check not reconstructing exactly the
    # same tree
    tasktree_content = set([leaf.name for leaf in n2content[task_tree_node]]) | set(task_outgroups)
    while trees_to_browse:
        master_node = trees_to_browse.pop()

        # if custom taxa levels are defined as targets, find them in this
        # subtree
        _TARGET_NODES = defaultdict(list) # this container is used by
                                          # processable_node function
        opt_levels = GLOBALS[threadid].get('_optimized_levels', None)
        if opt_levels is not None:
            # any descendant of the already processed node is suitable for
            # selection. If the ancestor of level-species is on top of the
            # task_tree_node, it will be discarded
            avail_nodes = set(master_node.get_descendants())
            for lin in opt_levels:
                sp2lin, lin2sp = GLOBALS["lineages"]
                optimized, strict_monophyly = opt_levels[lin]
                if not optimized:
                    ancestor = main_tree.get_common_ancestor(*lin2sp[lin])
                    if ancestor in avail_nodes:
                        # check that the node satisfies level monophyly config
                        ancestor_content = set([x.name for x in n2content[ancestor]])
                        if not strict_monophyly or lin2sp[lin] == ancestor_content:
                            _TARGET_NODES[ancestor].append(lin)
                        elif strict_monophyly:
                            log.log(26, "Discarding not monophyletic level @@11:%s@@1:" %lin)
                    else:
                        log.log(26, "Discarding upper clade @@11:%s@@1:" %lin)

        # The traversal is lazy: processable_node is evaluated while this
        # loop runs, so it sees the current master_node and _TARGET_NODES.
        for node in master_node.iter_leaves(is_leaf_fn=processable_node):
            if opt_levels:
                # NOTE(review): _TARGET_NODES is a defaultdict, so looking up
                # a node that is not a target inserts an empty entry. If no
                # target level was found above, this makes _TARGET_NODES
                # non-empty and processable_node starts rejecting the
                # remaining nodes -- confirm whether this is intended.
                log.log(28, "Trying to optimizing custom tree level: @@11:%s@@1:" %_TARGET_NODES[node])
                for lin in _TARGET_NODES[node]:
                    # Marks the level as optimized, so is not computed again
                    opt_levels[lin][0] = True

            log.log(28, "Found possible target node of size %s branch support %f" %(len(n2content[node]), node.support))
            log.log(28, "First suitable workflow: %s" %(node._target_wkname))

            # Finds best outgroup for the target node
            if npr_conf.use_outgroup:
                splitterconfname, _ = npr_conf.tree_splitter
                splitterconf = GLOBALS[threadid][splitterconfname]
                seqs, outs = select_sister_outgroup(node, n2content, splitterconf)
            else:
                seqs = set([_i.name for _i in n2content[node]])
                outs = set()


            if seqs | outs == tasktree_content:
                # Same tips as the tree just processed: optimizing it would
                # rebuild the same tree, so its subtree is browsed instead.
                log.log(26, "Discarding target node of size %s, due to identity with its parent node" %len(n2content[node]))
                trees_to_browse.append(node)
            else:
                npr_nodes += 1
                yield node, seqs, outs, node._target_wkname
    log.log(28, "%s nodes will be optimized", npr_nodes)
Esempio n. 6
0
def process_task(task, wkname, npr_conf, nodeid2info):
    alignerconf, alignerclass = npr_conf.aligner
    cleanerconf, cleanerclass = npr_conf.alg_cleaner
    mtesterconf, mtesterclass = npr_conf.model_tester
    treebuilderconf, treebuilderclass = npr_conf.tree_builder
    if not treebuilderclass:
        # Allows to dump algs in workflows with no tree tasks
        treebuilderclass = DummyTree
   
    splitterconf, splitterclass = npr_conf.tree_splitter
    
    conf = GLOBALS[task.configid]
    seqtype = task.seqtype
    nodeid = task.nodeid
    ttype = task.ttype
    taskid = task.taskid
    threadid = task.threadid
    node_info = nodeid2info[nodeid]
    size = task.size#node_info.get("size", 0)
    target_seqs = node_info.get("target_seqs", [])
    out_seqs = node_info.get("out_seqs", [])

    if not treebuilderclass or size < 4:
        # Allows to dump algs in workflows with no tree tasks or if tree
        # inference does not make sense given the number of sequences. DummyTree
        # will produce a fake fully collapsed newick tree.
        treebuilderclass = DummyTree
        mtesterclass = None
        
    # If more than one outgroup are used, enable the use of constrain
    if out_seqs and len(out_seqs) > 1:
        constrain_id = nodeid
    else:
        constrain_id = None
    
    new_tasks = []
    if ttype == "msf":
        # Register Tree constrains
        constrain_tree = "(%s, (%s));" %(','.join(sorted(task.out_seqs)), 
                                         ','.join(sorted(task.target_seqs)))
        _outs = "\n".join(map(lambda name: ">%s\n0" %name, sorted(task.out_seqs)))
        _tars = "\n".join(map(lambda name: ">%s\n1" %name, sorted(task.target_seqs)))
        constrain_alg = '\n'.join([_outs, _tars])
        db.add_task_data(nodeid, DATATYPES.constrain_tree, constrain_tree)
        db.add_task_data(nodeid, DATATYPES.constrain_alg, constrain_alg)
        db.dataconn.commit() # since the creation of some Task
                               # objects may require this info, I need
                               # to commit right now.

        # Register node
        db.add_node(task.threadid,
                    task.nodeid, task.cladeid,
                    task.target_seqs,
                    task.out_seqs)
       
        nodeid2info[nodeid]["size"] = task.size
        nodeid2info[nodeid]["target_seqs"] = task.target_seqs
        nodeid2info[nodeid]["out_seqs"] = task.out_seqs
        alg_task = alignerclass(nodeid, task.multiseq_file,
                                seqtype, conf, alignerconf)
        alg_task.size = task.size
        new_tasks.append(alg_task)
       

    elif ttype == "alg" or ttype == "acleaner":
        if ttype == "alg":
            nodeid2info[nodeid]["alg_path"] = task.alg_fasta_file
        elif ttype == "acleaner":
            nodeid2info[nodeid]["alg_clean_path"] = task.clean_alg_fasta_file
        
        alg_fasta_file = getattr(task, "clean_alg_fasta_file",
                                 task.alg_fasta_file)
        alg_phylip_file = getattr(task, "clean_alg_phylip_file",
                                  task.alg_phylip_file)

        # Calculate alignment stats           
        # cons_mean, cons_std = get_trimal_conservation(task.alg_fasta_file, 
        #                                        conf["app"]["trimal"])
        #  
        # max_identity = get_trimal_identity(task.alg_fasta_file, 
        #                                 conf["app"]["trimal"])
        # log.info("Conservation: %0.2f +-%0.2f", cons_mean, cons_std)
        # log.info("Max. Identity: %0.2f", max_identity)
        #import time
        #t1 = time.time()
        #mx, mn, mean, std = get_identity(task.alg_fasta_file)
        #print time.time()-t1
        #log.log(26, "Identity: max=%0.2f min=%0.2f mean=%0.2f +- %0.2f",
        #        mx, mn, mean, std)
        #t1 = time.time()

        if seqtype == "aa" and npr_conf.switch_aa_similarity < 1:
            try:
                alg_stats = db.get_task_data(taskid, DATATYPES.alg_stats) 
            except Exception, e:
                alg_stats = {}

            if ttype == "alg":
                algfile = pjoin(GLOBALS["input_dir"], task.alg_phylip_file)
                dataid = DATATYPES.alg_phylip
            elif ttype == "acleaner":
                algfile = pjoin(GLOBALS["input_dir"], task.clean_alg_phylip_file)
                dataid = DATATYPES.clean_alg_phylip

            if "i_mean" not in alg_stats:
                log.log(24, "Calculating alignment stats...")
                # dump data if necesary
                algfile = pjoin(GLOBALS["input_dir"], task.alg_phylip_file)
                if not pexist(algfile): 
                    # dump phylip alg
                    open(algfile, "w").write(db.get_data(db.get_dataid(taskid, dataid))) 

                mx, mn, mean, std = get_statal_identity(algfile,
                                                        conf["app"]["statal"])
                alg_stats = {"i_max":mx, "i_mean":mean, "i_min":mn, "i_std":std}
                db.add_task_data(taskid, DATATYPES.alg_stats, alg_stats)

            log.log(22, "Alignment stats (sequence similarity):")
            log.log(22, "   max: %(i_max)0.2f, min:%(i_min)0.2f, avg:%(i_mean)0.2f+-%(i_std)0.2f" %
                    (alg_stats))

        else:
            alg_stats = {"i_max":-1, "i_mean":-1, "i_min":-1, "i_std":-1}
        
        #print time.time()-t1
        #log.log(24, "Identity: max=%0.2f min=%0.2f mean=%0.2f +- %0.2f",
        #        mx, mn, mean, std)
        task.max_ident = alg_stats["i_max"]
        task.min_ident = alg_stats["i_min"]
        task.mean_ident = alg_stats["i_mean"]
        task.std_ident = alg_stats["i_std"]
        next_task = None

        if ttype == "alg" and cleanerclass:
            next_task = cleanerclass(nodeid, seqtype, alg_fasta_file,
                                     alg_phylip_file,
                                     conf, cleanerconf)
        else: 
            # Converts aa alignment into nt if necessary
            if  seqtype == "aa" and \
                    "nt" in GLOBALS["seqtypes"] and \
                    task.mean_ident >= npr_conf.switch_aa_similarity:
                log.log(28, "@@2:Switching to codon alignment!@@1: amino-acid sequence similarity: %0.2f >= %0.2f" %\
                        (task.mean_ident, npr_conf.switch_aa_similarity))
                alg_fasta_file = "%s.%s" %(taskid, DATATYPES.alg_nt_fasta)
                alg_phylip_file = "%s.%s" %(taskid, DATATYPES.alg_nt_phylip)
                try:
                    alg_fasta_file = db.get_dataid(taskid, DATATYPES.alg_nt_fasta)
                    alg_fasta_file = db.get_dataid(taskid, DATATYPES.alg_nt_phylip)
                except ValueError:
                    log.log(22, "Calculating codon alignment...")

                    source_alg = pjoin(GLOBALS["input_dir"], task.alg_fasta_file)
                    if ttype == "alg":
                        kept_columns = []
                    elif ttype == "acleaner":
                        # if original alignment was trimmed, use it as reference
                        # but make the nt alignment only on the kept columns
                        kept_columns = db.get_task_data(taskid, DATATYPES.kept_alg_columns)

                    if not pexist(source_alg):
                        open(source_alg, "w").write(db.get_task_data(taskid, DATATYPES.alg_fasta)) 

                    nt_alg = switch_to_codon(source_alg, kept_columns=kept_columns)
                    db.add_task_data(taskid, DATATYPES.alg_nt_fasta, nt_alg.write())
                    db.add_task_data(taskid, DATATYPES.alg_nt_phylip, nt_alg.write(format='iphylip_relaxed'))

                npr_conf = IterConfig(conf, wkname, task.size, "nt")
                seqtype = "nt"
                                          
            if mtesterclass:
                next_task = mtesterclass(nodeid, alg_fasta_file,
                                         alg_phylip_file,
                                         constrain_id,
                                         conf, mtesterconf)
            elif treebuilderclass:
                next_task = treebuilderclass(nodeid, alg_phylip_file,
                                             constrain_id,
                                             None, seqtype,
                                             conf, treebuilderconf)
        if next_task:
            next_task.size = task.size
            new_tasks.append(next_task)