Beispiel #1
0
def pipeline(task, wkname, conf=None):
    logindent(2)

    if not task:  # in this case, conf is expected
        source_seqtype = "aa" if "aa" in GLOBALS["seqtypes"] else "nt"
        all_seqs = GLOBALS["target_sequences"]
        initial_task = Msf(set(all_seqs), set(), seqtype=source_seqtype)

        initial_task.main_tree = None
        initial_task.threadid = generate_runid()
        initial_task.configid = initial_task.threadid
        initial_task.target_wkname = wkname
        # Register node
        db.add_node(initial_task.threadid, initial_task.nodeid,
                    initial_task.cladeid, initial_task.target_seqs,
                    initial_task.out_seqs)

        new_tasks = [initial_task]
    else:
        conf = GLOBALS[task.configid]
        npr_conf = IterConfig(conf, wkname, task.size, task.seqtype)
        new_tasks = process_task(task, wkname, npr_conf, conf["_nodeinfo"])

    process_new_tasks(task, new_tasks, conf)
    logindent(-2)

    return new_tasks
Beispiel #2
0
    def load_jobs(self):
        # I want a single phylognetic tree for each cog
        from ete3.tools.phylobuild_lib.workflow.genetree import pipeline

        for co in self.cogs:
            # Register a new msf task for each COG, using the same
            # config file but opening an new tree reconstruction
            # thread.
            job = Msf(set(co), set(), seqtype=self.seqtype)
            job.main_tree = None
            job.threadid = generate_runid()
            job.configid = self.conf["_configid"]
            # This converts the job in a workflow job. As soon as a
            # task is done, it will be automatically processed and the
            # new tasks will be registered as new jobs.
            job.task_processor = pipeline
            job.target_wkname = self.genetree_workflow
            self.jobs.append(job)
            self.cog_ids.add(job.nodeid)
Beispiel #3
0
def process_task(task, wkname, npr_conf, nodeid2info):
    alignerconf, alignerclass = npr_conf.aligner
    cleanerconf, cleanerclass = npr_conf.alg_cleaner
    mtesterconf, mtesterclass = npr_conf.model_tester
    treebuilderconf, treebuilderclass = npr_conf.tree_builder
    if not treebuilderclass:
        # Allows to dump algs in workflows with no tree tasks
        treebuilderclass = DummyTree

    splitterconf, splitterclass = npr_conf.tree_splitter

    conf = GLOBALS[task.configid]
    seqtype = task.seqtype
    nodeid = task.nodeid
    ttype = task.ttype
    taskid = task.taskid
    threadid = task.threadid
    node_info = nodeid2info[nodeid]
    size = task.size  #node_info.get("size", 0)
    target_seqs = node_info.get("target_seqs", [])
    out_seqs = node_info.get("out_seqs", [])

    if not treebuilderclass or size < 4:
        # Allows to dump algs in workflows with no tree tasks or if tree
        # inference does not make sense given the number of sequences. DummyTree
        # will produce a fake fully collapsed newick tree.
        treebuilderclass = DummyTree

    # If more than one outgroup are used, enable the use of constrain
    if out_seqs and len(out_seqs) > 1:
        constrain_id = nodeid
    else:
        constrain_id = None

    new_tasks = []
    if ttype == "msf":
        # Register Tree constrains
        constrain_tree = "(%s, (%s));" % (','.join(sorted(
            task.out_seqs)), ','.join(sorted(task.target_seqs)))
        _outs = "\n".join([">%s\n0" % name for name in sorted(task.out_seqs)])
        _tars = "\n".join(
            [">%s\n1" % name for name in sorted(task.target_seqs)])
        constrain_alg = '\n'.join([_outs, _tars])
        db.add_task_data(nodeid, DATATYPES.constrain_tree, constrain_tree)
        db.add_task_data(nodeid, DATATYPES.constrain_alg, constrain_alg)
        db.dataconn.commit()  # since the creation of some Task
        # objects may require this info, I need
        # to commit right now.

        # Register node
        db.add_node(task.threadid, task.nodeid, task.cladeid, task.target_seqs,
                    task.out_seqs)

        nodeid2info[nodeid]["size"] = task.size
        nodeid2info[nodeid]["target_seqs"] = task.target_seqs
        nodeid2info[nodeid]["out_seqs"] = task.out_seqs
        alg_task = alignerclass(nodeid, task.multiseq_file, seqtype, conf,
                                alignerconf)
        alg_task.size = task.size
        new_tasks.append(alg_task)

    elif ttype == "alg" or ttype == "acleaner":
        if ttype == "alg":
            nodeid2info[nodeid]["alg_path"] = task.alg_fasta_file
        elif ttype == "acleaner":
            nodeid2info[nodeid]["alg_clean_path"] = task.clean_alg_fasta_file

        alg_fasta_file = getattr(task, "clean_alg_fasta_file",
                                 task.alg_fasta_file)
        alg_phylip_file = getattr(task, "clean_alg_phylip_file",
                                  task.alg_phylip_file)

        # Calculate alignment stats
        # cons_mean, cons_std = get_trimal_conservation(task.alg_fasta_file,
        #                                        conf["app"]["trimal"])
        #
        # max_identity = get_trimal_identity(task.alg_fasta_file,
        #                                 conf["app"]["trimal"])
        # log.info("Conservation: %0.2f +-%0.2f", cons_mean, cons_std)
        # log.info("Max. Identity: %0.2f", max_identity)
        #import time
        #t1 = time.time()
        #mx, mn, mean, std = get_identity(task.alg_fasta_file)
        #print time.time()-t1
        #log.log(26, "Identity: max=%0.2f min=%0.2f mean=%0.2f +- %0.2f",
        #        mx, mn, mean, std)
        #t1 = time.time()

        if seqtype == "aa" and npr_conf.switch_aa_similarity < 1:
            try:
                alg_stats = db.get_task_data(taskid, DATATYPES.alg_stats)
            except Exception as e:
                alg_stats = {}

            if ttype == "alg":
                algfile = pjoin(GLOBALS["input_dir"], task.alg_phylip_file)
                dataid = DATATYPES.alg_phylip
            elif ttype == "acleaner":
                algfile = pjoin(GLOBALS["input_dir"],
                                task.clean_alg_phylip_file)
                dataid = DATATYPES.clean_alg_phylip

            if "i_mean" not in alg_stats:
                log.log(24, "Calculating alignment stats...")
                # dump data if necesary
                algfile = pjoin(GLOBALS["input_dir"], task.alg_phylip_file)
                if not pexist(algfile):
                    # dump phylip alg
                    open(algfile,
                         "w").write(db.get_data(db.get_dataid(taskid, dataid)))

                mx, mn, mean, std = get_statal_identity(
                    algfile, conf["app"]["statal"])
                alg_stats = {
                    "i_max": mx,
                    "i_mean": mean,
                    "i_min": mn,
                    "i_std": std
                }
                db.add_task_data(taskid, DATATYPES.alg_stats, alg_stats)

            log.log(22, "Alignment stats (sequence similarity):")
            log.log(
                22,
                "   max: %(i_max)0.2f, min:%(i_min)0.2f, avg:%(i_mean)0.2f+-%(i_std)0.2f"
                % (alg_stats))

        else:
            alg_stats = {"i_max": -1, "i_mean": -1, "i_min": -1, "i_std": -1}

        #print time.time()-t1
        #log.log(24, "Identity: max=%0.2f min=%0.2f mean=%0.2f +- %0.2f",
        #        mx, mn, mean, std)
        task.max_ident = alg_stats["i_max"]
        task.min_ident = alg_stats["i_min"]
        task.mean_ident = alg_stats["i_mean"]
        task.std_ident = alg_stats["i_std"]
        next_task = None

        if ttype == "alg" and cleanerclass:
            next_task = cleanerclass(nodeid, seqtype, alg_fasta_file,
                                     alg_phylip_file, conf, cleanerconf)
        else:
            # Converts aa alignment into nt if necessary
            if  seqtype == "aa" and \
                    "nt" in GLOBALS["seqtypes"] and \
                    task.mean_ident >= npr_conf.switch_aa_similarity:
                log.log(28, "@@2:Switching to codon alignment!@@1: amino-acid sequence similarity: %0.2f >= %0.2f" %\
                        (task.mean_ident, npr_conf.switch_aa_similarity))
                alg_fasta_file = "%s.%s" % (taskid, DATATYPES.alg_nt_fasta)
                alg_phylip_file = "%s.%s" % (taskid, DATATYPES.alg_nt_phylip)
                try:
                    alg_fasta_file = db.get_dataid(taskid,
                                                   DATATYPES.alg_nt_fasta)
                    alg_fasta_file = db.get_dataid(taskid,
                                                   DATATYPES.alg_nt_phylip)
                except ValueError:
                    log.log(22, "Calculating codon alignment...")

                    source_alg = pjoin(GLOBALS["input_dir"],
                                       task.alg_fasta_file)
                    if ttype == "alg":
                        kept_columns = []
                    elif ttype == "acleaner":
                        # if original alignment was trimmed, use it as reference
                        # but make the nt alignment only on the kept columns
                        kept_columns = db.get_task_data(
                            taskid, DATATYPES.kept_alg_columns)

                    if not pexist(source_alg):
                        open(source_alg, "w").write(
                            db.get_task_data(taskid, DATATYPES.alg_fasta))

                    nt_alg = switch_to_codon(source_alg,
                                             kept_columns=kept_columns)
                    db.add_task_data(taskid, DATATYPES.alg_nt_fasta,
                                     nt_alg.write())
                    db.add_task_data(taskid, DATATYPES.alg_nt_phylip,
                                     nt_alg.write(format='iphylip_relaxed'))

                npr_conf = IterConfig(conf, wkname, task.size, "nt")
                seqtype = "nt"

            if mtesterclass:
                next_task = mtesterclass(nodeid, alg_fasta_file,
                                         alg_phylip_file, constrain_id, conf,
                                         mtesterconf)
            elif treebuilderclass:
                next_task = treebuilderclass(nodeid, alg_phylip_file,
                                             constrain_id, None, seqtype, conf,
                                             treebuilderconf)
        if next_task:
            next_task.size = task.size
            new_tasks.append(next_task)

    elif ttype == "mchooser":
        if treebuilderclass:
            alg_fasta_file = task.alg_fasta_file
            alg_phylip_file = task.alg_phylip_file
            model = task.best_model
            tree_task = treebuilderclass(nodeid, alg_phylip_file, constrain_id,
                                         model, seqtype, conf, treebuilderconf)
            tree_task.size = task.size
            new_tasks.append(tree_task)

    elif ttype == "tree":
        treemerge_task = splitterclass(nodeid, seqtype, task.tree_file, conf,
                                       splitterconf)
        #if conf["tree_splitter"]["_outgroup_size"]:
        #    treemerge_task = TreeSplitterWithOutgroups(nodeid, seqtype, task.tree_file, main_tree, conf)
        #else:
        #    treemerge_task = TreeSplitter(nodeid, seqtype, task.tree_file, main_tree, conf)

        treemerge_task.size = task.size
        new_tasks.append(treemerge_task)

    elif ttype == "treemerger":
        if not task.task_tree:
            task.finish()

        log.log(24, "Saving task tree...")
        annotate_node(task.task_tree, task)
        db.update_node(nid=task.nodeid,
                       runid=task.threadid,
                       newick=db.encode(task.task_tree))
        db.commit()

        if not isinstance(treebuilderclass,
                          DummyTree) and npr_conf.max_iters > 1:
            current_iter = get_iternumber(threadid)
            if npr_conf.max_iters and current_iter >= npr_conf.max_iters:
                log.warning("Maximum number of iterations reached!")
            else:
                # Add new nodes
                source_seqtype = "aa" if "aa" in GLOBALS["seqtypes"] else "nt"
                ttree, mtree = task.task_tree, task.main_tree
                log.log(26, "Processing tree: %s seqs, %s outgroups",
                        len(target_seqs), len(out_seqs))
                alg_path = node_info.get("clean_alg_path",
                                         node_info["alg_path"])
                for node, seqs, outs, wkname in get_next_npr_node(
                        threadid, ttree, task.out_seqs, mtree, alg_path,
                        npr_conf):
                    log.log(24, "Registering new node: %s seqs, %s outgroups",
                            len(seqs), len(outs))
                    new_task_node = Msf(seqs, outs, seqtype=source_seqtype)
                    new_task_node.target_wkname = wkname
                    new_tasks.append(new_task_node)
    return new_tasks