Example #1
File: phyml.py Project: jhcepas/npr
    def __init__(self, nodeid, alg_phylip_file, constrain_id, model,
                 seqtype, conf, confname, parts_id=None):

        GLOBALS["citator"].add(PHYML_CITE)
        
        # Build the flags as ordered pairs so their order is preserved
        # (an OrderedDict fed from a dict literal would not keep it)
        base_args = OrderedDict([
                ("--model", ""),
                ("--no_memory_check", ""),
                ("--quiet", ""),
                ("--constraint_tree", "")])
        
        self.confname = confname
        self.conf = conf
        self.constrain_tree = None
        if constrain_id:
            self.constrain_tree = db.get_dataid(constrain_id, DATATYPES.constrain_tree)
        self.alg_phylip_file = alg_phylip_file
        
        TreeTask.__init__(self, nodeid, "tree", "Phyml", 
                          base_args, conf[confname])

        if seqtype == "aa":
            self.model = model or conf[confname]["_aa_model"]
        elif seqtype == "nt":
            self.model = model or conf[confname]["_nt_model"]
        self.seqtype = seqtype
        self.lk = None

        self.init()
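
Note: db.get_dataid raises ValueError when no record matches, so these
examples guard optional lookups in two ways: a truthiness check on
constrain_id (as above and in Example #6) or a try/except around the call
(raxml.py, Example #5). Both reduce to the same optional lookup; a minimal
sketch, assuming the db API used in these examples (maybe_get_dataid itself
is hypothetical):

def maybe_get_dataid(taskid, datatype):
    # Return the registered data id, or None if nothing matches
    try:
        return db.get_dataid(taskid, datatype)
    except ValueError:
        return None
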
Example #2
    def __init__(self, nodeid, alg_file, constrain_id, model, seqtype, conf, confname, parts_id=None):
        GLOBALS["citator"].add(FASTTREE_CITE)

        self.confname = confname
        self.conf = conf
        self.alg_phylip_file = alg_file
        self.constrain_tree = None
        if constrain_id:
            self.constrain_tree = db.get_dataid(constrain_id, DATATYPES.constrain_alg)
        self.alg_basename = basename(self.alg_phylip_file)
        self.seqtype = seqtype
        self.tree_file = ""
        if model:
            log.warning("FastTree does not support model selection")

        self.model = None
        self.lk = None

        base_args = OrderedDict()
        base_args["-nopr"] = ""
        if self.seqtype == "nt":
            base_args["-gtr -nt"] = ""
        elif self.seqtype == "aa":
            pass
        else:
            raise ValueError("Unknown seqtype %s" % self.seqtype)

        TreeTask.__init__(self, nodeid, "tree", "FastTree", base_args, self.conf[confname])

        self.init()
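
Note: base_args maps command-line flags to values, using empty strings for
bare switches; an ordered mapping keeps the flags in a predictable order in
the final command line. A minimal sketch of how such a mapping could be
flattened (the real TreeTask logic may differ):

from collections import OrderedDict

args = OrderedDict([("-nopr", ""), ("-gtr -nt", "")])
cmd = "fasttree " + " ".join(
    ("%s %s" % (flag, value)).strip() for flag, value in args.items())
# cmd == "fasttree -nopr -gtr -nt"
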
Example #3
    def finish(self):
        if self.conf[self.confname]["_alg_trimming"]:
            # If trimming happened after mcoffee, save the resulting output
            trim_job = self.jobs[-1]
            alg = SeqGroup(pjoin(trim_job.jobdir, trim_job.alg_fasta_file))
            fasta = alg.write(format="fasta")
            phylip = alg.write(format="iphylip_relaxed")
            AlgTask.store_data(self, fasta, phylip)
        else:
            # If no post-trimming happened, the output is just what Mcoffee
            # produced, so its data ids can be recycled.
            mc_task = self.jobs[-1]
            fasta_id = db.get_dataid(mc_task.taskid, DATATYPES.alg_fasta)
            phylip_id = db.get_dataid(mc_task.taskid, DATATYPES.alg_phylip)
            db.register_task_data(self.taskid, DATATYPES.alg_fasta, fasta_id)
            db.register_task_data(self.taskid, DATATYPES.alg_phylip, phylip_id)
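
Note: the two branches persist results differently. store_data writes new
fasta/phylip blobs under self.taskid, while register_task_data only records
aliases to blobs already stored by Mcoffee; this is inferred from the usage
here, since the db internals are not shown:

# Conceptually, the registry behaves like a mapping
# {(taskid, datatype): dataid} over a blob store {dataid: data}, so
# register_task_data(self.taskid, DATATYPES.alg_fasta, fasta_id)
# adds a second key pointing at an existing blob instead of copying it.
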
Example #4
def get_stored_data(fileid):
    try:
        _tid, _did = fileid.split(".")
        _did = int(_did)
    except (IndexError, ValueError): 
        dataid = fileid
    else:
        dataid = db.get_dataid(_tid, _did)
    return db.get_data(dataid)
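
Usage sketch: get_stored_data accepts either a raw data id or a compound
"taskid.datatype" string whose second field must parse as an int; anything
else falls through to the raw-id path. (The IndexError clause never actually
fires: a failed two-way unpack of split() raises ValueError.) With
hypothetical ids:

get_stored_data("f3a9c0.4")   # -> db.get_data(db.get_dataid("f3a9c0", 4))
get_stored_data("f3a9c0")     # no dot: passed straight to db.get_data
get_stored_data("a.b.c")      # 3 fields -> ValueError -> treated as raw id
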
Example #5
File: raxml.py Project: jhcepas/npr
    def __init__(self, nodeid, alg_file, constrain_id, model,
                 seqtype, conf, confname, parts_id=None):
        GLOBALS["citator"].add(RAXML_CITE)

        base_args = OrderedDict()
        self.bootstrap = conf[confname].get("_bootstrap", None)
        
        model = model or conf[confname]["_aa_model"]
        
        self.confname = confname
        self.conf = conf
        self.alg_phylip_file = alg_file
        
        try:
            self.constrain_tree = db.get_dataid(constrain_id, DATATYPES.constrain_tree)
        except ValueError:
            # No constraint tree registered for this node
            self.constrain_tree = None

        self.partitions_file = parts_id
            
        TreeTask.__init__(self, nodeid, "tree", "RaxML", 
                          base_args, conf[confname])

        max_cores = GLOBALS["_max_cores"]
        appname = conf[confname]["_app"]
        if max_cores > 1:
            threads = conf["threading"].get("raxml-pthreads", 1)
            if threads > 1:
                appname = appname.replace("raxml", "raxml-pthreads")
        else:
            appname = appname.replace("raxml-pthreads", "raxml")
            threads = 1
        # Resolve the binary once the final app name is known (the original
        # left raxml_bin unset when no pthreads were configured)
        raxml_bin = conf["app"][appname]

        self.raxml_bin = raxml_bin
        self.threads = threads
        self.seqtype = seqtype

        # Process raxml options
        method = conf[confname].get("_method", "GAMMA").upper()
        if seqtype.lower() == "aa":
            self.model_string =  'PROT%s%s' %(method, model.upper())
            self.model = model 
        elif seqtype.lower() == "nt":
            self.model_string =  'GTR%s' %method
            self.model = "GTR"
        else:
            raise ValueError("Unknown seqtype %s", seqtype)
        #inv = conf[confname].get("pinv", "").upper()
        #freq = conf[confname].get("ebf", "").upper()

        self.init()
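
Note: model_string follows RAxML's model-naming convention, combining the
rate-heterogeneity method with the substitution model. With the default
method "GAMMA", for example:

# seqtype "aa", model "JTT"    -> model_string == "PROTGAMMAJTT"
# seqtype "nt"                 -> model_string == "GTRGAMMA" (model pinned to "GTR")
# with _method = "CAT" instead -> "PROTCATJTT" / "GTRCAT"
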
Example #6
    def __init__(self, nodeid, alg_file, constrain_id, model, seqtype,
                 conf, confname, parts_id=None):
        self.confname = confname
        self.conf = conf
        self.alg_phylip_file = alg_file
        self.constrain_tree = None
        if constrain_id:
            self.constrain_tree = db.get_dataid(constrain_id, DATATYPES.constrain_alg)
        self.alg_basename = basename(self.alg_phylip_file)
        self.seqtype = seqtype
        self.tree_file = ""
        self.model = None
        self.lk = None

        TreeTask.__init__(self, nodeid, "tree", "DummyTree", {}, {})
        self.init()
Example #7
def process_task(task, wkname, npr_conf, nodeid2info):
    cogconf, cogclass = npr_conf.cog_selector
    concatconf, concatclass = npr_conf.alg_concatenator
    treebuilderconf, treebuilderclass = npr_conf.tree_builder
    splitterconf, splitterclass = npr_conf.tree_splitter
    
    threadid, nodeid, seqtype, ttype = (task.threadid, task.nodeid,
                                        task.seqtype, task.ttype)
    cladeid, targets, outgroups = db.get_node_info(threadid, nodeid)

    if not treebuilderclass or task.size < 4:
        # Allows dumping algs in workflows with no tree tasks, or when tree
        # inference does not make sense given the number of sequences.
        # DummyTree will produce a fake, fully collapsed newick tree.
        treebuilderclass = DummyTree
    
    if outgroups and len(outgroups) > 1:
        constrain_id = nodeid
    else:
        constrain_id = None
        
    node_info = nodeid2info[nodeid]
    conf = GLOBALS[task.configid]
    new_tasks = []    
    if ttype == "cog_selector":
       
        # Generates an md5 id based on the genetree workflow configuration
        # used for the concat alg task. If anything changes, the concat alg
        # id changes too and the associated tree will be rebuilt.
        config_blocks = set([wkname])
        for key, value in conf[wkname].iteritems():
            if isinstance(value, (list, tuple, set)):
                for elem in value:
                    if isinstance(elem, str) and elem.startswith("@"):
                        config_blocks.add(elem[1:])
            elif isinstance(value, str) and value.startswith("@"):
                config_blocks.add(value[1:])
        config_checksum = md5(''.join(["[%s]\n%s" % (x, dict_string(conf[x]))
                                       for x in sorted(config_blocks)]))

        # THIS PART HAS BEEN MOVED TO COG_SELECTOR TASK
        # Check that current selection of cogs will cover all target and
        # outgroup species
        #cog_hard_limit = int(conf[concatconf]["_max_cogs"])
        #sp_repr = defaultdict(int)
        #for co in task.raw_cogs[:cog_hard_limit]:
        #    for sp, seq in co:
        #        sp_repr[sp] += 1
        #missing_sp = (targets | outgroups) - set(sp_repr.keys())
        #if missing_sp:
        #    raise TaskError("missing species under current cog selection: %s" %missing_sp)
        #else:
        #    log.log(28, "Analysis of current COG selection:")
        #    for sp, ncogs in sorted(sp_repr.items(), key=lambda x:x[1]):
        #        log.log(28, "   % 30s species present in % 6d COGs" %(sp, ncogs))
                
        # Register the concat alignment task. The nodeid associated with
        # concat_alg tasks and all their children jobs should take into
        # account the COG information, not only the included species and
        # outgroups.
        
        concat_job = concatclass(task.cogs, seqtype, conf, concatconf,
                                 config_checksum)
        db.add_node(threadid,
                    concat_job.nodeid, cladeid,
                    targets, outgroups)

        # Register tree constraints
        constrain_tree = "(%s, (%s));" %(','.join(sorted(outgroups)),
                                         ','.join(sorted(targets)))
        _outs = "\n".join(map(lambda name: ">%s\n0" %name, sorted(outgroups)))
        _tars = "\n".join(map(lambda name: ">%s\n1" %name, sorted(targets)))
        constrain_alg = '\n'.join([_outs, _tars])
        db.add_task_data(concat_job.nodeid, DATATYPES.constrain_tree, constrain_tree)
        db.add_task_data(concat_job.nodeid, DATATYPES.constrain_alg, constrain_alg)
        db.dataconn.commit() # since the creation of some Task objects
                             # may require this info, I need to commit
                             # right now.
        concat_job.size = task.size
        new_tasks.append(concat_job)
       
    elif ttype == "concat_alg":
        # register tree for concat alignment, using constraint tree if
        # necessary
        alg_id = db.get_dataid(task.taskid, DATATYPES.concat_alg_phylip)
        try:
            parts_id = db.get_dataid(task.taskid, DATATYPES.model_partitions)
        except ValueError:
            parts_id = None

        nodeid2info[nodeid]["size"] = task.size
        nodeid2info[nodeid]["target_seqs"] = targets
        nodeid2info[nodeid]["out_seqs"] = outgroups
        tree_task = treebuilderclass(nodeid, alg_id,
                                     constrain_id, None,
                                     task.seqtype, conf, treebuilderconf,
                                     parts_id=parts_id)
        tree_task.size = task.size
        new_tasks.append(tree_task)
        
    elif ttype == "tree":
        merger_task = splitterclass(nodeid, seqtype, task.tree_file, conf, splitterconf)
        merger_task.size = task.size
        new_tasks.append(merger_task)

    elif ttype == "treemerger":
        # Let's merge with the main tree
        if not task.task_tree:
            task.finish()

        log.log(24, "Saving task tree...")
        annotate_node(task.task_tree, task)
        db.update_node(nid=task.nodeid, runid=task.threadid,
                       newick=db.encode(task.task_tree))
        db.commit()

        # treebuilderclass is a class object, so compare identities; the
        # original isinstance() check here was always False.
        if treebuilderclass is not DummyTree and npr_conf.max_iters > 1:
            current_iter = get_iternumber(threadid)
            if npr_conf.max_iters and current_iter >= npr_conf.max_iters:
                log.warning("Maximum number of iterations reached!")
            else:
                # Add new nodes
                source_seqtype = "aa" if "aa" in GLOBALS["seqtypes"] else "nt"
                ttree, mtree = task.task_tree, task.main_tree

                log.log(26, "Processing tree: %s seqs, %s outgroups",
                        len(targets), len(outgroups))

                target_cladeids = None
                if tobool(conf[splitterconf].get("_find_ncbi_targets", False)):
                    tcopy = mtree.copy()
                    ncbi.connect_database()
                    tax2name, tax2track = ncbi.annotate_tree_with_taxa(tcopy, None)
                    #tax2name, tax2track = ncbi.annotate_tree_with_taxa(tcopy, "fake") # for testing sptree example
                    n2content = tcopy.get_cached_content()
                    broken_branches, broken_clades, broken_clade_sizes, tax2name = ncbi.get_broken_branches(tcopy, n2content)
                    log.log(28, 'restricting NPR to broken clades: '+
                            colorify(', '.join(map(lambda x: "%s"%tax2name[x], broken_clades)), "wr"))
                    target_cladeids = set()
                    for branch in broken_branches:
                        print branch.get_ascii(attributes=['spname', 'taxid'], compact=True)
                        print map(lambda x: "%s"%tax2name[x], broken_branches[branch])
                        target_cladeids.add(branch.cladeid)

                # The None argument skips alg checks; note that wkname here
                # shadows the process_task() parameter of the same name.
                for node, seqs, outs, wkname in get_next_npr_node(
                        task.configid, ttree, task.out_seqs, mtree,
                        None, npr_conf, target_cladeids):
                    log.log(24, "Adding new node: %s seqs, %s outgroups",
                            len(seqs), len(outs))
                    new_task_node = cogclass(seqs, outs,
                                             source_seqtype, conf, cogconf)
                    new_task_node.target_wkname = wkname
                    new_tasks.append(new_task_node)
                    db.add_node(threadid,
                                new_task_node.nodeid, new_task_node.cladeid,
                                new_task_node.targets,
                                new_task_node.outgroups)
    return new_tasks
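
Note: the constraint data registered for each concat_alg node encodes the
outgroup/target split twice: as a two-clade newick topology and as a binary
pseudo-alignment (0 for outgroups, 1 for targets). For outgroups {"O1"} and
targets {"T1", "T2"}, the stored strings would be:

constrain_tree = "(O1, (T1,T2));"
constrain_alg = ">O1\n0\n>T1\n1\n>T2\n1"
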
Example #8
    def finish(self):
        def euc_dist(x, y):
            # Normalized symmetric-difference distance between two leaf-name
            # sets (0.0 = identical, 1.0 = disjoint); not truly Euclidean.
            return len(x.symmetric_difference(y)) / float(len(x) + len(y))
        dataid = db.get_dataid(*self.task_tree_file.split("."))
        ttree = PhyloTree(db.get_data(dataid))
        mtree = self.main_tree
        ttree.dist = 0
        cladeid, target_seqs, out_seqs = db.get_node_info(self.threadid, self.nodeid)
        self.out_seqs = out_seqs
        self.target_seqs = target_seqs

        ttree_content = ttree.get_cached_content()
        if mtree and not out_seqs:
            mtree_content = mtree.get_cached_content()
            log.log(24, "Finding best scoring outgroup from previous iteration.")
            for _n in mtree_content:
                if _n.cladeid == cladeid:
                    orig_target = _n 
            target_left = set([_n.name for _n in mtree_content[orig_target.children[0]]])
            target_right = set([_n.name for _n in mtree_content[orig_target.children[1]]])
                    
            partition_pairs = []
            everything = set([_n.name for _n in ttree_content[ttree]])
            for n, content in ttree_content.iteritems():
                if n is ttree:
                    continue
                left = set([_n.name for _n in content])
                right =  everything - left
                d1 = euc_dist(left, target_left)
                d2 = euc_dist(left, target_right)
                best_match = min(d1, d2)
                partition_pairs.append([best_match, left, right, n])

            partition_pairs.sort()
            
            self.outgroup_match_dist = partition_pairs[0][0]
            #self.outgroup_match = '#'.join( ['|'.join(partition_pairs[0][1]),
            #                      '|'.join(partition_pairs[0][2])] )

            
            outgroup = partition_pairs[0][3]
            ttree.set_outgroup(outgroup)
      
            ttree.dist = orig_target.dist
            ttree.support = orig_target.support

            # Merge task and main trees
            parent = orig_target.up
            orig_target.detach()
            parent.add_child(ttree)

        elif mtree and out_seqs:
            log.log(26, "Rooting tree using %d custom seqs" %
                   len(out_seqs))

            self.outgroup_match = '|'.join(out_seqs)
                        
            #log.log(22, "Out seqs:    %s", len(out_seqs))
            #log.log(22, "Target seqs: %s", target_seqs)
            if len(out_seqs) > 1:
                # First root to a single seq outside the outgroup (this
                # should never fail and avoids random outgroup-split
                # problems in unrooted trees)
                ttree.set_outgroup(ttree & list(target_seqs)[0])
                # Now try to get the outgroup node as a monophyletic clade
                outgroup = ttree.get_common_ancestor(out_seqs)
                if set(outgroup.get_leaf_names()) ^ out_seqs:
                    msg = ("Monophyly of the selected outgroup could not be "
                           "guaranteed! The constraint tree probably failed.")
                    #dump_tree_debug(msg, self.taskdir, mtree, ttree, target_seqs, out_seqs)
                    raise TaskError(self, msg)
            else:
                outgroup = ttree & list(out_seqs)[0]

            ttree.set_outgroup(outgroup)
            orig_target = self.main_tree.get_common_ancestor(target_seqs)
            found_target = outgroup.get_sisters()[0]

            ttree = ttree.get_common_ancestor(target_seqs)
            outgroup.detach()
            self.pre_iter_support = orig_target.support
            # Use previous dist and support
            ttree.dist = orig_target.dist
            ttree.support = orig_target.support
            parent = orig_target.up
            orig_target.detach()
            parent.add_child(ttree)
               
        else:
            # ROOTS FIRST ITERATION
            log.log(24, "Getting outgroup for first NPR split")
            
            # If an early split is provided on the command line, it
            # overrides the config file
            mainout = GLOBALS.get("first_split_outgroup", "midpoint")
            
            if mainout.lower() == "midpoint":
                log.log(26, "Rooting to midpoint.")
                best_outgroup = ttree.get_midpoint_outgroup()
                if best_outgroup:
                    ttree.set_outgroup(best_outgroup)
                else:
                    log.warning("Midpoint outgroup could not be set!")
                    ttree.set_outgroup(ttree.iter_leaves().next())
            else:
                if mainout.startswith("~"):
                    # Lazy defined outgroup. Will trust in the common
                    # ancestor of two or more OTUs
                    strict_common_ancestor = False
                    outs = set(mainout[1:].split())
                    if len(outs) < 2:          
                        raise TaskError(self, "First split outgroup error: common "
                                        "ancestor calculation requires at least two OTU names")
                else:
                    strict_common_ancestor = True
                    outs = set(mainout.split())

                if outs - target_seqs:
                    raise TaskError(self, "Unknown seqs cannot be used to set "
                                    "first split rooting: %s" % (outs - target_seqs))
                    
                if len(outs) > 1:
                    anchor = list(set(target_seqs) - outs)[0]
                    ttree.set_outgroup(ttree & anchor)
                    common = ttree.get_common_ancestor(outs)
                    out_seqs = common.get_leaf_names()
                    if common is ttree:
                        msg = "First split outgroup could not be guaranteed: %s" % out_seqs
                        #dump_tree_debug(msg, self.taskdir, mtree, ttree, target_seqs, outs)
                        raise TaskError(self, msg)
                    if strict_common_ancestor and set(out_seqs) ^ outs:
                        msg = "Monophyly of first split outgroup could not be guaranteed: %s" % out_seqs
                        #dump_tree_debug(msg, self.taskdir, mtree, ttree, target_seqs, outs)
                        raise TaskError(self, msg)
                    
                    log.log(26, "@@8:First split rooting to %d seqs@@1:: %s" %(len(out_seqs),out_seqs))
                    ttree.set_outgroup(common)
                else:
                    single_out = outs.pop()
                    # set_outgroup returns None, so the original assignment
                    # to "common" was dropped; the node is looked up with &
                    ttree.set_outgroup(ttree & single_out)
                    log.log(26, "@@8:First split rooting to 1 seq@@1:: %s" % single_out)
                    
            self.main_tree = ttree
            orig_target = ttree

        tn = orig_target.copy()
        self.pre_iter_task_tree = tn
        self.rf = orig_target.robinson_foulds(ttree)
        self.pre_iter_support = orig_target.support
                
        # Reload node2content of the rooted tree and generate cladeids
        ttree_content = self.main_tree.get_cached_content()
        for n, content in ttree_content.iteritems():
            cid = generate_id([_n.name for _n in content])
            n.add_feature("cladeid", cid)

        #ttree.write(outfile=self.pruned_tree)
        self.task_tree = ttree
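
Note: euc_dist is used above to find which bipartition of the new task tree
best matches one side of the original split: the symmetric difference of the
two leaf-name sets, normalized by their combined size. A few worked values:

euc_dist(set("ab"), set("ab"))  # 0 / 4.0 -> 0.0  (identical sets)
euc_dist(set("ab"), set("bc"))  # 2 / 4.0 -> 0.5  (one element swapped)
euc_dist(set("ab"), set("cd"))  # 4 / 4.0 -> 1.0  (disjoint sets)
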
Example #9
    def finish(self):
        # Assumes tasks resulting from a genetree workflow, in which only
        # Alg and Acleaner tasks can contain the results
        log.log(26, "Collecting supermatrix data")
        
        # jobtypes = set()
        # job2alg, job2acleaner = {}, {}
        # for job in self.jobs:
        #     jobtypes.add(job.ttype)
        #     if job.ttype == "alg" and job.nodeid not in self.job2alg:
        #         dataid = db.get_dataid(*job.alg_fasta_file.split("."))
        #         job2alg[job.nodeid] = db.get_data(dataid)
        #     elif job.ttype == "acleaner":
        #         a, b =  job.clean_alg_fasta_file.split(".")
        #         dataid = db.get_dataid(*job.clean_alg_fasta_file.split("."))
        #         job2acleaner[job.nodeid] = db.get_data(dataid)
        #     elif job.ttype == "mchooser":
        #         self.job2model[job.nodeid] = job.best_model

        # Extract the alignments from the tree jobs in the genetree workflow,
        # to make sure the correct version is used: raw, trimmed, or even the
        # nt-switched one
        observed_seqtypes = set()
        self.job2alg = {}
        for job in self.jobs:
            if job.ttype == "tree":
                observed_seqtypes.add(job.seqtype)
                taskid, datatype = job.alg_phylip_file.split(".")
                dataid = db.get_dataid(taskid, datatype)
                self.job2alg[job.nodeid] = db.get_data(dataid)
            elif job.ttype == "mchooser":
                self.job2model[job.nodeid] = job.best_model

        # If all alignments are nt, use nt as the seqtype for the concat alg
        if len(observed_seqtypes) > 1:
            raise TaskError('Mixed data types not supported in super-matrix workflow')
        elif "nt" in observed_seqtypes:
            self.seqtype = "nt"
            self.default_model = self.conf[self.confname]["_default_nt_model"]
        else:
            self.seqtype = "aa"            
            self.default_model = self.conf[self.confname]["_default_aa_model"]

        missing = self.cog_ids - set(self.job2alg)
        if missing:
            log.error("Missing %s algs", len(missing))
            raise TaskError(self, "Missing algs (%d): i.e. %s" %
                            (len(missing), list(missing)[:10]))

        alg_data = [(self.job2alg[nid],
                     self.job2model.get(nid, self.default_model))
                    for nid in self.job2alg]
        filenames, models = zip(*alg_data)

        mainalg, partitions, sp2alg, species, alg_lengths = get_concatenated_alg(
            filenames,
            models, sp_field=0,
            sp_delimiter=GLOBALS["spname_delimiter"])

        log.log(20, "Done concat alg, now writting fasta format")
        fasta = mainalg.write(format="fasta")
        log.log(20, "Done concat alg, now writting phylip format")
        phylip = mainalg.write(format="iphylip_relaxed")
        txt_partitions = '\n'.join(partitions)
        log.log(26, "Modeled regions: \n"+'\n'.join(partitions))
        ConcatAlg.store_data(self, fasta, phylip, txt_partitions)
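
Note: alg_data pairs every alignment with its model, falling back to the
per-seqtype default when no mchooser result exists, and zip(*...) splits the
pairs back into two parallel tuples (despite the name, "filenames" holds
alignment data fetched from the db above). A toy illustration:

alg_data = [("alg_A", "JTT"), ("alg_B", "WAG")]
filenames, models = zip(*alg_data)
# filenames == ("alg_A", "alg_B"); models == ("JTT", "WAG")
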
Example #10
def process_task(task, wkname, npr_conf, nodeid2info):
    alignerconf, alignerclass = npr_conf.aligner
    cleanerconf, cleanerclass = npr_conf.alg_cleaner
    mtesterconf, mtesterclass = npr_conf.model_tester
    treebuilderconf, treebuilderclass = npr_conf.tree_builder
    if not treebuilderclass:
        # Allows to dump algs in workflows with no tree tasks
        treebuilderclass = DummyTree
   
    splitterconf, splitterclass = npr_conf.tree_splitter
    
    conf = GLOBALS[task.configid]
    seqtype = task.seqtype
    nodeid = task.nodeid
    ttype = task.ttype
    taskid = task.taskid
    threadid = task.threadid
    node_info = nodeid2info[nodeid]
    size = task.size  # node_info.get("size", 0)
    target_seqs = node_info.get("target_seqs", [])
    out_seqs = node_info.get("out_seqs", [])

    if not treebuilderclass or size < 4:
        # Allows dumping algs in workflows with no tree tasks, or when tree
        # inference does not make sense given the number of sequences.
        # DummyTree will produce a fake, fully collapsed newick tree.
        treebuilderclass = DummyTree
        mtesterclass = None
        
    # If more than one outgroup is used, enable the use of a constraint tree
    if out_seqs and len(out_seqs) > 1:
        constrain_id = nodeid
    else:
        constrain_id = None
    
    new_tasks = []
    if ttype == "msf":
        # Register tree constraints
        constrain_tree = "(%s, (%s));" %(','.join(sorted(task.out_seqs)), 
                                         ','.join(sorted(task.target_seqs)))
        _outs = "\n".join(map(lambda name: ">%s\n0" %name, sorted(task.out_seqs)))
        _tars = "\n".join(map(lambda name: ">%s\n1" %name, sorted(task.target_seqs)))
        constrain_alg = '\n'.join([_outs, _tars])
        db.add_task_data(nodeid, DATATYPES.constrain_tree, constrain_tree)
        db.add_task_data(nodeid, DATATYPES.constrain_alg, constrain_alg)
        db.dataconn.commit() # since the creation of some Task
                             # objects may require this info, I need
                             # to commit right now

        # Register node
        db.add_node(task.threadid,
                    task.nodeid, task.cladeid,
                    task.target_seqs,
                    task.out_seqs)
       
        nodeid2info[nodeid]["size"] = task.size
        nodeid2info[nodeid]["target_seqs"] = task.target_seqs
        nodeid2info[nodeid]["out_seqs"] = task.out_seqs
        alg_task = alignerclass(nodeid, task.multiseq_file,
                                seqtype, conf, alignerconf)
        alg_task.size = task.size
        new_tasks.append(alg_task)
       

    elif ttype == "alg" or ttype == "acleaner":
        if ttype == "alg":
            nodeid2info[nodeid]["alg_path"] = task.alg_fasta_file
        elif ttype == "acleaner":
            nodeid2info[nodeid]["alg_clean_path"] = task.clean_alg_fasta_file
        
        alg_fasta_file = getattr(task, "clean_alg_fasta_file",
                                 task.alg_fasta_file)
        alg_phylip_file = getattr(task, "clean_alg_phylip_file",
                                  task.alg_phylip_file)

        # Calculate alignment stats           
        # cons_mean, cons_std = get_trimal_conservation(task.alg_fasta_file, 
        #                                        conf["app"]["trimal"])
        #  
        # max_identity = get_trimal_identity(task.alg_fasta_file, 
        #                                 conf["app"]["trimal"])
        # log.info("Conservation: %0.2f +-%0.2f", cons_mean, cons_std)
        # log.info("Max. Identity: %0.2f", max_identity)
        #import time
        #t1 = time.time()
        #mx, mn, mean, std = get_identity(task.alg_fasta_file)
        #print time.time()-t1
        #log.log(26, "Identity: max=%0.2f min=%0.2f mean=%0.2f +- %0.2f",
        #        mx, mn, mean, std)
        #t1 = time.time()

        if seqtype == "aa" and npr_conf.switch_aa_similarity < 1:
            try:
                alg_stats = db.get_task_data(taskid, DATATYPES.alg_stats)
            except Exception:
                alg_stats = {}

            if ttype == "alg":
                algfile = pjoin(GLOBALS["input_dir"], task.alg_phylip_file)
                dataid = DATATYPES.alg_phylip
            elif ttype == "acleaner":
                algfile = pjoin(GLOBALS["input_dir"], task.clean_alg_phylip_file)
                dataid = DATATYPES.clean_alg_phylip

            if "i_mean" not in alg_stats:
                log.log(24, "Calculating alignment stats...")
                # dump data if necesary
                algfile = pjoin(GLOBALS["input_dir"], task.alg_phylip_file)
                if not pexist(algfile): 
                    # dump phylip alg
                    open(algfile, "w").write(db.get_data(db.get_dataid(taskid, dataid))) 

                mx, mn, mean, std = get_statal_identity(algfile,
                                                        conf["app"]["statal"])
                alg_stats = {"i_max":mx, "i_mean":mean, "i_min":mn, "i_std":std}
                db.add_task_data(taskid, DATATYPES.alg_stats, alg_stats)

            log.log(22, "Alignment stats (sequence similarity):")
            log.log(22, "   max: %(i_max)0.2f, min:%(i_min)0.2f, avg:%(i_mean)0.2f+-%(i_std)0.2f" %
                    (alg_stats))

        else:
            alg_stats = {"i_max":-1, "i_mean":-1, "i_min":-1, "i_std":-1}
        
        #print time.time()-t1
        #log.log(24, "Identity: max=%0.2f min=%0.2f mean=%0.2f +- %0.2f",
        #        mx, mn, mean, std)
        task.max_ident = alg_stats["i_max"]
        task.min_ident = alg_stats["i_min"]
        task.mean_ident = alg_stats["i_mean"]
        task.std_ident = alg_stats["i_std"]
        next_task = None

        if ttype == "alg" and cleanerclass:
            next_task = cleanerclass(nodeid, seqtype, alg_fasta_file,
                                     alg_phylip_file,
                                     conf, cleanerconf)
        else: 
            # Converts the aa alignment into nt if necessary
            if (seqtype == "aa" and "nt" in GLOBALS["seqtypes"] and
                    task.mean_ident >= npr_conf.switch_aa_similarity):
                log.log(28, "@@2:Switching to codon alignment!@@1: amino-acid "
                        "sequence similarity: %0.2f >= %0.2f" %
                        (task.mean_ident, npr_conf.switch_aa_similarity))
                alg_fasta_file = "%s.%s" % (taskid, DATATYPES.alg_nt_fasta)
                alg_phylip_file = "%s.%s" % (taskid, DATATYPES.alg_nt_phylip)
                try:
                    # Reuse the codon alignment if it is already in the DB.
                    # (The original assigned alg_fasta_file on both lookups,
                    # dropping the phylip data id.)
                    alg_fasta_file = db.get_dataid(taskid, DATATYPES.alg_nt_fasta)
                    alg_phylip_file = db.get_dataid(taskid, DATATYPES.alg_nt_phylip)
                except ValueError:
                    log.log(22, "Calculating codon alignment...")

                    source_alg = pjoin(GLOBALS["input_dir"], task.alg_fasta_file)
                    if ttype == "alg":
                        kept_columns = []
                    elif ttype == "acleaner":
                        # if original alignment was trimmed, use it as reference
                        # but make the nt alignment only on the kept columns
                        kept_columns = db.get_task_data(taskid, DATATYPES.kept_alg_columns)

                    if not pexist(source_alg):
                        open(source_alg, "w").write(db.get_task_data(taskid, DATATYPES.alg_fasta)) 

                    nt_alg = switch_to_codon(source_alg, kept_columns=kept_columns)
                    db.add_task_data(taskid, DATATYPES.alg_nt_fasta, nt_alg.write())
                    db.add_task_data(taskid, DATATYPES.alg_nt_phylip, nt_alg.write(format='iphylip_relaxed'))

                npr_conf = IterConfig(conf, wkname, task.size, "nt")
                seqtype = "nt"
                                          
            if mtesterclass:
                next_task = mtesterclass(nodeid, alg_fasta_file,
                                         alg_phylip_file,
                                         constrain_id,
                                         conf, mtesterconf)
            elif treebuilderclass:
                next_task = treebuilderclass(nodeid, alg_phylip_file,
                                             constrain_id,
                                             None, seqtype,
                                             conf, treebuilderconf)
        if next_task:
            next_task.size = task.size
            new_tasks.append(next_task)
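
Note: the aa-to-nt switch above fires only when nucleotide data exists for
the workflow and the mean amino-acid identity reaches the configured
threshold; near-identical protein alignments carry little signal, so the
tree is re-estimated on codons. A minimal sketch of the decision with
hypothetical values (the real ones come from npr_conf and alg_stats):

seqtypes = set(["aa", "nt"])
switch_aa_similarity = 0.95   # npr_conf.switch_aa_similarity
mean_ident = 0.97             # task.mean_ident
switch = "nt" in seqtypes and mean_ident >= switch_aa_similarity  # True
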
Example #11
def schedule(workflow_task_processor, pending_tasks, schedule_time, execution, debug, norender):    
    # Adjust debug mode
    if debug == "all":
        log.setLevel(10)
    pending_tasks = set(pending_tasks)
    
    ## ===================================
    ## INITIALIZE BASIC VARS 
    execution, run_detached = execution
    thread2tasks = defaultdict(list)
    for task in pending_tasks:
        thread2tasks[task.configid].append(task)
    expected_threads = set(thread2tasks.keys())
    past_threads = {}
    thread_errors = defaultdict(list)
    ## END OF VARS AND SHORTCUTS
    ## ===================================

    cores_total = GLOBALS["_max_cores"]
    if cores_total > 0:
        job_queue = Queue()
        
        back_launcher = Process(target=background_job_launcher,
                                args=(job_queue, run_detached,
                                      GLOBALS["launch_time"], cores_total))
        back_launcher.start()
    else:
        job_queue = None
        back_launcher = None

    GLOBALS["_background_scheduler"] = back_launcher
    GLOBALS["_job_queue"] = job_queue

        
    # Captures Ctrl-C for debugging
    #signal.signal(signal.SIGINT, control_c)

    last_report_time = None

    # Job ids already sent to the queue, so clones are never re-launched
    BUG = set()
    try:
        # Enters into task scheduling
        while pending_tasks:
            wtime = schedule_time

            # ask SGE for running jobs
            if execution == "sge":
                sgeid2jobs = db.get_sge_tasks()
                qstat_jobs = sge.qstat()
            else:
                qstat_jobs = None

            # Show summary of pending tasks per thread
            thread2tasks = defaultdict(list)
            for task in pending_tasks:
                thread2tasks[task.configid].append(task)
            set_logindent(0)
            log.log(28, "@@13: Updating tasks status:@@1: (%s)" % (ctime()))
            info_lines = []
            for tid, tlist in thread2tasks.iteritems():
                threadname = GLOBALS[tid]["_name"]
                sizelist = ["%s" %getattr(_ts, "size", "?") for _ts in tlist]
                info = "Thread @@13:%s@@1:: pending tasks: @@8:%s@@1: of sizes: %s" %(
                    threadname, len(tlist), ', '.join(sizelist))
                info_lines.append(info)

            for line in info_lines:
                log.log(28, line)

            if GLOBALS["email"]  and last_report_time is None:
                last_report_time = time()
                send_mail(GLOBALS["email"], "Your NPR process has started", '\n'.join(info_lines))

            ## ================================
            ## CHECK AND UPDATE CURRENT TASKS
            checked_tasks = set()
            check_start_time = time()
            to_add_tasks = set()

            GLOBALS["cached_status"] = {}
            for task in sorted(pending_tasks, sort_tasks):
                # Avoids endless periods without new job submissions
                elapsed_time = time() - check_start_time
                #if not back_launcher and pending_tasks and \
                #        elapsed_time > schedule_time * 2:
                #    log.log(26, "@@8:Interrupting task checks to schedule new jobs@@1:")
                #    db.commit()
                #    wtime = launch_jobs(sorted(pending_tasks, sort_tasks),
                #                        execution, run_detached)
                #    check_start_time = time()

                # Enter debugging mode if necessary
                if debug and log.level > 10 and task.taskid.startswith(debug):
                    log.setLevel(10)
                    log.debug("ENTERING DEBUGGING MODE")
                thread2tasks[task.configid].append(task)

                # Update tasks and job statuses

                if task.taskid not in checked_tasks:
                    try:
                        show_task_info(task)
                        task.status = task.get_status(qstat_jobs)
                        db.dataconn.commit()
                        if back_launcher and task.status not in set("DE"):
                            for j, cmd in task.iter_waiting_jobs():
                                j.status = "Q"
                                GLOBALS["cached_status"][j.jobid] = "Q"
                                if j.jobid not in BUG:
                                    if not os.path.exists(j.jobdir):
                                        os.makedirs(j.jobdir)
                                    for ifile, outpath in j.input_files.iteritems():
                                        try:
                                            _tid, _did = ifile.split(".")
                                            _did = int(_did)
                                        except (IndexError, ValueError): 
                                            dataid = ifile
                                        else:
                                            dataid = db.get_dataid(_tid, _did)

                                        if not outpath:
                                            outfile = pjoin(GLOBALS["input_dir"], ifile)
                                        else:
                                            outfile = pjoin(outpath, ifile)

                                        if not os.path.exists(outfile): 
                                            open(outfile, "w").write(db.get_data(dataid))

                                    log.log(24, "  @@8:Queueing @@1: %s from %s" %(j, task))
                                    job_queue.put([j.jobid, j.cores, cmd, j.status_file])
                                BUG.add(j.jobid)

                        update_task_states_recursively(task)
                        db.commit()
                        checked_tasks.add(task.taskid)
                    except TaskError, e:
                        log.error("Errors found in %s" %task)
                        import traceback
                        traceback.print_exc()
                        if GLOBALS["email"]:
                            threadname = GLOBALS[task.configid]["_name"]
                            send_mail(GLOBALS["email"], "Errors found in %s!" %threadname,
                                      '\n'.join(map(str, [task, e.value, e.msg])))
                        pending_tasks.discard(task)
                        thread_errors[task.configid].append([task, e.value, e.msg])
                        continue
                else:
                    # Set a temporary Queued state to avoid launching
                    # jobs from clones
                    task.status = "Q"
                    if log.level < 24:
                        show_task_info(task)

                if task.status == "D":
                    #db.commit()
                    show_task_info(task)
                    logindent(3)


                    # Log commands of every task
                    if 'cmd_log_file' not in GLOBALS[task.configid]:
                        GLOBALS[task.configid]['cmd_log_file'] = pjoin(
                            GLOBALS[task.configid]["_outpath"], "cmd.log")
                        # create an empty log file
                        open(GLOBALS[task.configid]['cmd_log_file'], "w").close()

                    cmd_lines = get_cmd_log(task)
                    CMD_LOG = open(GLOBALS[task.configid]['cmd_log_file'], "a")
                    print >>CMD_LOG, task
                    for c in cmd_lines:
                        print >>CMD_LOG, '   ' + '\t'.join(map(str, c))
                    CMD_LOG.close()

                    try:
                        #wkname = GLOBALS[task.configid]['_name']
                        create_tasks = workflow_task_processor(task, task.target_wkname)
                    except TaskError, e:
                        log.error("Errors found in %s" %task)
                        pending_tasks.discard(task)
                        thread_errors[task.configid].append([task, e.value, e.msg])
                        continue
                    else: 
                        logindent(-3)

                        to_add_tasks.update(create_tasks)
                        pending_tasks.discard(task)

                elif task.status == "E":
                    log.error("task contains errors: %s " %task)
                    log.error("Errors found in %s")
                    pending_tasks.discard(task)
                    thread_errors[task.configid].append([task, None, "Found (E) task status"])