Example #1
    def finish(self):
        lks = []
        if self.lk_mode == "phyml":
            for job in self.jobs:
                if job.jobcat != "bionj": continue
                phyml_job = job
                tree_file = pjoin(phyml_job.jobdir,
                                  self.alg_phylip_file+"_phyml_tree.txt")
                stats_file = pjoin(phyml_job.jobdir,
                                   self.alg_phylip_file+"_phyml_stats.txt")
                tree = PhyloTree(tree_file)
                m = re.search(r'Log-likelihood:\s+(-?\d+\.\d+)',
                              open(stats_file).read())
                lk = float(m.groups()[0])
                tree.add_feature("lk", lk)
                tree.add_feature("model", phyml_job.args["--model"])
                lks.append([float(tree.lk), tree.model, tree])
        elif self.lk_mode == "raxml":
            for job in self.jobs:
                if job.jobcat != "raxml": continue
                raxml_job = job
                lk = open(pjoin(raxml_job.jobdir, "RAxML_log.%s"
                                %raxml_job.args["-n"])).readline().split()[1]
                tree = PhyloTree(raxml_job.args["-t"])
                tree.add_feature("lk", lk)
                tree.add_feature("model", raxml_job.model)
                lks.append([float(tree.lk), tree.model, tree])

        # sort lks in ascending order
        lks.sort()
        # choose the model with the highest likelihood, i.e. the last one in the list
        best_model = lks[-1][1]
        best_tree = lks[-1][2]
        log.log(22, "%s model selected from the following lk values:\n%s" %(best_model, '\n'.join(map(str, lks))))
        ModelTesterTask.store_data(self, best_model, lks)
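A minimal sketch of the selection step above, with made-up log-likelihood values (trees elided as None):

    lks = [[-2310.5, "JTT", None], [-2290.1, "LG", None], [-2305.7, "WAG", None]]
    lks.sort()                  # ascending: worst likelihood first
    best_model = lks[-1][1]     # highest (least negative) log-likelihood wins
    assert best_model == "LG"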
Example #2
    def load_jobs(self):
        conf = self.conf
        for m in self.models:
            args = self.args.copy()
            args["--model"] = m
            bionj_job = Job(conf["app"]["phyml"],
                            args,
                            parent_ids=[self.nodeid])
            bionj_job.jobname += "-bionj-" + m
            bionj_job.jobcat = "bionj"
            bionj_job.add_input_file(self.alg_phylip_file, bionj_job.jobdir)
            self.jobs.append(bionj_job)

            if self.lk_mode == "raxml":
                raxml_args = {
                    "-f": "e",
                    "-s": pjoin(bionj_job.jobdir, self.alg_phylip_file),
                    "-m": "PROTGAMMA%s" % m,
                    "-n": self.alg_phylip_file + "." + m,
                    "-t": pjoin(bionj_job.jobdir,
                                self.alg_phylip_file + "_phyml_tree.txt")
                }
                raxml_job = Job(conf["app"]["raxml"],
                                raxml_args,
                                parent_ids=[bionj_job.jobid])
                raxml_job.jobname += "-lk-optimize"
                raxml_job.dependencies.add(bionj_job)
                raxml_job.model = m
                raxml_job.jobcat = "raxml"
                self.jobs.append(raxml_job)
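A hedged sketch of how such an argument mapping renders as a command line (the join below is illustrative; ete3's Job builds the real command internally, and dict insertion order is assumed, i.e. Python 3.7+):

    raxml_args = {"-f": "e", "-s": "alg.phy", "-m": "PROTGAMMAJTT",
                  "-n": "alg.phy.JTT", "-t": "alg.phy_phyml_tree.txt"}
    cmd = "raxml " + " ".join("%s %s" % (k, v) for k, v in raxml_args.items())
    # -> raxml -f e -s alg.phy -m PROTGAMMAJTT -n alg.phy.JTT -t alg.phy_phyml_tree.txt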
Example #3
    def load_jobs(self):
        conf = self.conf
        for m in self.models:
            args = self.args.copy()
            args["--model"] = m
            bionj_job = Job(conf["app"]["phyml"], args,
                      parent_ids=[self.nodeid])
            bionj_job.jobname += "-bionj-" + m
            bionj_job.jobcat = "bionj"
            bionj_job.add_input_file(self.alg_phylip_file, bionj_job.jobdir)
            self.jobs.append(bionj_job)

            if self.lk_mode == "raxml":
                raxml_args = {
                    "-f": "e",
                    "-s": pjoin(bionj_job.jobdir, self.alg_phylip_file),
                    "-m": "PROTGAMMA%s" % m,
                    "-n": self.alg_phylip_file+"."+m,
                    "-t": pjoin(bionj_job.jobdir,
                                       self.alg_phylip_file+"_phyml_tree.txt")
                    }
                raxml_job = Job(conf["app"]["raxml"], raxml_args,
                                parent_ids=[bionj_job.jobid])
                raxml_job.jobname += "-lk-optimize"
                raxml_job.dependencies.add(bionj_job)
                raxml_job.model = m
                raxml_job.jobcat = "raxml"
                self.jobs.append(raxml_job)
Example #4
def zdecode(x):
    if x.startswith("__DBDIR__:"):
        data_id = x.split(':', 1)[1]
        data = six.moves.cPickle.load(
            open(pjoin(GLOBALS['db_dir'], data_id+".pkl"), "rb"))  # pickle needs binary mode
    else:
        data = six.moves.cPickle.loads(zlib.decompress(base64.decodestring(x)))
    return data
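The dispatch rule above in isolation: values tagged with the "__DBDIR__:" prefix point to pickle files under db_dir; anything else is inline zlib-compressed, base64-encoded pickle data (the data id below is hypothetical):

    x = "__DBDIR__:task1.1"
    if x.startswith("__DBDIR__:"):
        data_id = x.split(':', 1)[1]   # -> "task1.1", loaded from db_dir/task1.1.pkl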
Example #5
    def finish(self):
        # Once executed, alignment is converted into relaxed
        # interleaved phylip format. Both files, fasta and phylip,
        # remain accessible.

        # Set Task specific attributes
        main_job = self.jobs[0]
        fasta_path = pjoin(main_job.jobdir, "clean.alg.fasta")
        alg = SeqGroup(fasta_path)
        if len(alg) != self.size:
            log.warning(
                "Trimming was too aggressive and tried"
                " to remove one or more sequences."
                " Alignment trimming will be disabled for this dataset.")
            self.clean_alg_fasta_file = db.register_task_data(
                self.taskid, DATATYPES.clean_alg_fasta, self.alg_fasta_file)
            self.clean_alg_phylip_file = db.register_task_data(
                self.taskid, DATATYPES.clean_alg_phylip, self.alg_phylip_file)
        else:
            for line in open(self.jobs[0].stdout_file):
                line = line.strip()
                if line.startswith("#ColumnsMap"):
                    kept_columns = list(
                        map(int,
                            line.split("\t")[1].split(",")))
            fasta = alg.write(format="fasta")
            phylip = alg.write(format="iphylip_relaxed")
            AlgCleanerTask.store_data(self, fasta, phylip, kept_columns)
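A minimal sketch of the "#ColumnsMap" parsing above, run on a made-up trimAl stdout line:

    line = "#ColumnsMap\t0,1,2,5,6,9"
    kept_columns = list(map(int, line.split("\t")[1].split(",")))
    assert kept_columns == [0, 1, 2, 5, 6, 9]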
Example #6
 def load_jobs(self):
     appname = self.conf[self.confname]["_app"]
     args = self.args.copy()
     args["-in"] = pjoin(GLOBALS["input_dir"], self.alg_fasta_file)
     args["-out"] = "clean.alg.fasta"
     job = Job(self.conf["app"][appname], args, parent_ids=[self.nodeid])
     job.add_input_file(self.alg_fasta_file)
     self.jobs.append(job)
Example #7
def zdecode(x):
    if x.startswith("__DBDIR__:"):
        data_id = x.split(':', 1)[1]
        data = six.moves.cPickle.load(
            open(pjoin(GLOBALS['db_dir'], data_id + ".pkl"), "rb"))
    else:
        data = six.moves.cPickle.loads(zlib.decompress(base64.decodestring(x)))
    return data
Example #8
def zencode(x, data_id):
    pdata = six.moves.cPickle.dumps(x)
    if sys.getsizeof(pdata) > MAX_SQLITE_SIZE:
        # using protocol 2 fails because of an integer overflow bug in Python;
        # see http://bugs.python.org/issue13555
        six.moves.cPickle.dump(x, open(pjoin(GLOBALS['db_dir'], data_id+".pkl"), "wb"))
        return "__DBDIR__:%s" %data_id
    else:
        return base64.encodestring(zlib.compress(pdata))
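A round-trip sketch pairing zencode with the zdecode shown earlier, for the inline branch (assuming Python 2 era semantics: base64.encodestring/decodestring accept and return str, and both were removed in Python 3.9; the data id is arbitrary and unused when the payload fits inline):

    payload = {"tree": "(A,(B,C));", "lk": -1234.5}
    blob = zencode(payload, "task1.1")        # small payload -> inline zlib+base64
    assert not blob.startswith("__DBDIR__:")
    assert zdecode(blob) == payload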
Example #9
 def load_jobs(self):
     # Only one Muscle job is necessary to run this task
     appname = self.conf[self.confname]["_app"]
     args = OrderedDict(self.args)
     args["-in"] = pjoin(GLOBALS["input_dir"], self.multiseq_file)
     args["-out"] = "alg.fasta"
     job = Job(self.conf["app"][appname], args, parent_ids=[self.nodeid])
     job.add_input_file(self.multiseq_file)
     self.jobs.append(job)
Example #10
 def load_jobs(self):
     # Only one Muscle job is necessary to run this task
     appname = self.conf[self.confname]["_app"]
     args = OrderedDict(self.args)
     args[''] = "%s %s" % (pjoin(GLOBALS["input_dir"],
                                 self.multiseq_file), "alg.fasta")
     job = Job(self.conf["app"][appname], args, parent_ids=[self.nodeid])
     job.add_input_file(self.multiseq_file)
     self.jobs.append(job)
Example #11
 def load_jobs(self):
     appname = self.conf[self.confname]["_app"]
     # Only one Muscle job is necessary to run this task
     args = OrderedDict(self.args)
     args["-i"] = pjoin(GLOBALS["input_dir"], self.multiseq_file)
     args["-o"] = "alg.fasta"
     job = Job(self.conf["app"][appname], args, parent_ids=[self.nodeid])
     job.cores = self.conf["threading"].get(appname, 1)
     job.add_input_file(self.multiseq_file)
     self.jobs.append(job)
Example #12
def zencode(x, data_id):
    pdata = six.moves.cPickle.dumps(x)
    if sys.getsizeof(pdata) > MAX_SQLITE_SIZE:
        # using protocol 2 fails because of an integer overflow bug in Python;
        # see http://bugs.python.org/issue13555
        six.moves.cPickle.dump(
            x, open(pjoin(GLOBALS['db_dir'], data_id + ".pkl"), "wb"))
        return "__DBDIR__:%s" % data_id
    else:
        return base64.encodestring(zlib.compress(pdata))
Example #13
 def load_jobs(self):
     appname = self.conf[self.confname]["_app"]
     args = OrderedDict(self.args)
     # Mafft redirects resulting alg to std.output. The order of
     # arguments is important, input file must be the last
     # one.
     args[""] = pjoin(GLOBALS["input_dir"], self.multiseq_file)
     job = Job(self.conf["app"][appname], args, parent_ids=[self.nodeid])
     job.add_input_file(self.multiseq_file)
     job.cores = self.conf["threading"][appname]
     self.jobs.append(job)
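The empty-string key above acts as a bare positional argument. A hedged sketch of the rendering (hypothetical; flag names are illustrative and the real command is assembled inside Job), assuming OrderedDict preserves the flag order:

    from collections import OrderedDict
    args = OrderedDict([("--auto", ""), ("", "input.fa")])
    cmd = " ".join(("%s %s" % (k, v)).strip() for k, v in args.items())
    assert cmd == "--auto input.fa"   # input file stays last, as MAFFT needs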
Example #14
    def load_jobs(self):
        args = self.args.copy()
        args["-outfile"] = "mcoffee.fasta"

        alg_paths = [pjoin(GLOBALS["input_dir"], algid)
                     for algid in self.all_alg_files]
        args["-aln"] = ' '.join(alg_paths)
        job = Job(self.conf["app"]["tcoffee"], args, parent_ids=self.parent_ids)
        for key in self.all_alg_files:
            job.add_input_file(key)
        self.jobs.append(job)
Example #15
def app_wrapper(func, args):
    global NCURSES
    base_dir = GLOBALS.get("scratch_dir", GLOBALS["basedir"])
    lock_file = pjoin(base_dir, "alive")

    if not args.enable_ui:
        NCURSES = False

    if not pexist(lock_file) or args.clearall:
        open(lock_file, "w").write(time.ctime())
    else:
        clear_env()
        print('\nThe same process seems to be running. Use --clearall or remove the lock file "alive" within the output dir', file=sys.stderr)
        sys.exit(-1)

    try:
        if NCURSES:
            curses.wrapper(main, func, args)
        else:
            main(None, func, args)
    except ConfigError as e:
        if GLOBALS.get('_background_scheduler', None):
            GLOBALS['_background_scheduler'].terminate()

        print("\nConfiguration Error:", e, file=sys.stderr)
        clear_env()
        sys.exit(-1)
    except DataError as e:
        if GLOBALS.get('_background_scheduler', None):
            GLOBALS['_background_scheduler'].terminate()

        print("\nData Error:", e, file=sys.stderr)
        clear_env()
        sys.exit(-1)
    except KeyboardInterrupt:
        # Control-C is also grabbed by the back_launcher, so it is not
        # necessary to terminate from here
        print("\nProgram was interrupted.", file=sys.stderr)
        if args.monitor:
            print(("VERY IMPORTANT !!!: Note that launched"
                                 " jobs will keep running as you provided the --monitor flag"), file=sys.stderr)
        clear_env()
        sys.exit(-1)
    except:
        if GLOBALS.get('_background_scheduler', None):
            GLOBALS['_background_scheduler'].terminate()

        clear_env()
        raise
    else:
        if GLOBALS.get('_background_scheduler', None):
            GLOBALS['_background_scheduler'].terminate()

        clear_env()
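A minimal standalone sketch of the lock-file guard above (pexist is os.path.exists in the surrounding module):

    import os, sys, time
    lock_file = "alive"
    if not os.path.exists(lock_file):
        open(lock_file, "w").write(time.ctime())   # claim the lock
    else:
        sys.exit('another instance appears to be running; remove "alive" to force')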
Example #16
    def load_jobs(self):
        args = self.args.copy()

        try:
            del args["-wag"]
        except KeyError:
            pass

        if self.constrain_tree:
            args["-constraints"] = pjoin(GLOBALS["input_dir"], self.constrain_tree)

        args[pjoin(GLOBALS["input_dir"], self.alg_phylip_file)] = ""
        appname = self.conf[self.confname]["_app"]

        job = Job(self.conf["app"][appname], args, parent_ids=[self.nodeid])
        job.cores = self.conf["threading"][appname]
        if self.constrain_tree:
            job.add_input_file(self.constrain_tree)
        job.add_input_file(self.alg_phylip_file)
        self.jobs.append(job)
Example #17
    def load_jobs(self):
        args = self.args.copy()

        try:
            del args["-wag"]
        except KeyError:
            pass

        if self.constrain_tree:
            args["-constraints"] = pjoin(GLOBALS["input_dir"],
                                         self.constrain_tree)

        args[pjoin(GLOBALS["input_dir"], self.alg_phylip_file)] = ""
        appname = self.conf[self.confname]["_app"]

        job = Job(self.conf["app"][appname], args, parent_ids=[self.nodeid])
        job.cores = self.conf["threading"][appname]
        if self.constrain_tree:
            job.add_input_file(self.constrain_tree)
        job.add_input_file(self.alg_phylip_file)
        self.jobs.append(job)
Example #18
    def finish(self):
        # Once executed, alignment is converted into relaxed
        # interleaved phylip format.
        alg = SeqGroup(os.path.join(self.jobs[0].jobdir, "mcoffee.fasta"))
        fasta = alg.write(format="fasta")
        phylip = alg.write(format="iphylip_relaxed")

        alg_list_string = '\n'.join([pjoin(GLOBALS["input_dir"],
                                           aname) for aname in self.all_alg_files])
        db.add_task_data(self.taskid, DATATYPES.alg_list, alg_list_string)

        AlgTask.store_data(self, fasta, phylip)
Example #19
    def finish(self):
        lks = []
        if self.lk_mode == "phyml":
            for job in self.jobs:
                if job.jobcat != "bionj": continue
                phyml_job = job
                tree_file = pjoin(phyml_job.jobdir,
                                  self.alg_phylip_file + "_phyml_tree.txt")
                stats_file = pjoin(phyml_job.jobdir,
                                   self.alg_phylip_file + "_phyml_stats.txt")
                tree = PhyloTree(tree_file)
                m = re.search(r'Log-likelihood:\s+(-?\d+\.\d+)',
                              open(stats_file).read())
                lk = float(m.groups()[0])
                tree.add_feature("lk", lk)
                tree.add_feature("model", phyml_job.args["--model"])
                lks.append([float(tree.lk), tree.model, tree])
        elif self.lk_mode == "raxml":
            for job in self.jobs:
                if job.jobcat != "raxml": continue
                raxml_job = job
                lk = open(
                    pjoin(raxml_job.jobdir, "RAxML_log.%s" %
                          raxml_job.args["-n"])).readline().split()[1]
                tree = PhyloTree(raxml_job.args["-t"])
                tree.add_feature("lk", lk)
                tree.add_feature("model", raxml_job.model)
                lks.append([float(tree.lk), tree.model, tree])

        # sort lks in ascending order
        lks.sort()
        # choose the model with the highest likelihood, i.e. the last one in the list
        best_model = lks[-1][1]
        best_tree = lks[-1][2]
        log.log(
            22, "%s model selected from the following lk values:\n%s" %
            (best_model, '\n'.join(map(str, lks))))
        ModelTesterTask.store_data(self, best_model, lks)
Example #20
def clear_env():
    try:
        terminate_job_launcher()
    except:
        pass

    base_dir = GLOBALS["basedir"]
    lock_file = pjoin(base_dir, "alive")
    try:
        os.remove(lock_file)
    except Exception:
        print("could not remove lock file %s" %lock_file, file=sys.stderr)

    clear_tempdir()
Example #21
 def finish(self):
     if self.conf[self.confname]["_alg_trimming"]:
         # If trimming happened after mcoffee, let's save the
         # resulting output
         trim_job = self.jobs[-1]
         alg = SeqGroup(pjoin(trim_job.jobdir, trim_job.alg_fasta_file))
         fasta = alg.write(format="fasta")
         phylip = alg.write(format="iphylip_relaxed")
         AlgTask.store_data(self, fasta, phylip)
     else:
         # If no post trimming, output is just what Mcoffee
         # produced, so we can recycle its data ids.
         mc_task = self.jobs[-1]
         fasta_id = db.get_dataid(mc_task.taskid, DATATYPES.alg_fasta)
         phylip_id = db.get_dataid(mc_task.taskid, DATATYPES.alg_phylip)
         db.register_task_data(self.taskid, DATATYPES.alg_fasta, fasta_id)
         db.register_task_data(self.taskid, DATATYPES.alg_phylip, phylip_id)
Example #22
def dump_tree_debug(msg, taskdir, mtree, ttree, target_seqs, out_seqs):
    try:
        if out_seqs is None: out_seqs = set()
        if target_seqs is None: target_seqs = set()
        if ttree:
            for n in ttree.get_leaves():
                if n.name in out_seqs:
                    n.name = n.name + " *__OUTGROUP__*"
        if mtree:
            for n in mtree.get_leaves():
                if n.name in out_seqs:
                    n.name = n.name + " *__OUTGROUP__*"
                if n.name in target_seqs:
                    n.name = n.name + " [ TARGET ]"

        OUT = open(pjoin(taskdir, "__debug__"), "w")
        print(msg, file=OUT)
        print("MainTree:", mtree, file=OUT)
        print("TaskTree:", ttree, file=OUT)
        print("Expected outgroups:", out_seqs, file=OUT)
        OUT.close()
    except Exception as e:
        print(e)
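A hypothetical call, assuming ete3's PhyloTree and the module-level pjoin (os.path.join) import:

    from ete3 import PhyloTree
    t = PhyloTree("((A,B),(C,D));")
    dump_tree_debug("outgroup check", "/tmp", t, None, {"A", "B"}, {"D"})
    # /tmp/__debug__ now shows leaf D tagged *__OUTGROUP__* and A, B tagged [ TARGET ]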
Example #23
    def finish(self):
        # Once executed, alignment is converted into relaxed
        # interleaved phylip format. Both files, fasta and phylip,
        # remain accessible.

        # Set Task specific attributes
        main_job = self.jobs[0]
        fasta_path = pjoin(main_job.jobdir, "clean.alg.fasta")
        alg = SeqGroup(fasta_path)
        if len(alg) != self.size:
            log.warning("Trimming was too aggressive and tried"
                        " to remove one or more sequences."
                        " Alignment trimming will be disabled for this dataset.")
            self.clean_alg_fasta_file = db.register_task_data(self.taskid, DATATYPES.clean_alg_fasta, self.alg_fasta_file)
            self.clean_alg_phylip_file = db.register_task_data(self.taskid, DATATYPES.clean_alg_phylip, self.alg_phylip_file)
        else:
            for line in open(self.jobs[0].stdout_file):
                line = line.strip()
                if line.startswith("#ColumnsMap"):
                    kept_columns = list(map(int, line.split("\t")[1].split(",")))
            fasta = alg.write(format="fasta")
            phylip = alg.write(format="iphylip_relaxed")
            AlgCleanerTask.store_data(self, fasta, phylip, kept_columns)
Example #24
    def __init__(self, bin, args, jobname=None, parent_ids=None):
        # Used at execution time
        self.status = None
        # How to run the app
        self.bin = bin
        # command line arguments
        self.args = args
        # Default number of cores used by the job. If more than 1,
        # this attribute should be changed
        self.cores = 1
        self.exec_type = "insitu"
        self.jobname = jobname

        # Generates the unique job identifier based on the params of
        # the app. Some params include path names that could prevent
        # recycling the job, so we clean them.
        clean = lambda x: basename(x) if GLOBALS["basedir"] in x or GLOBALS["tasks_dir"] in x else x
        parsed_id_string = ["%s %s" %(clean(str(pair[0])), clean(str(pair[1])))
                            for pair in six.iteritems(self.args)]
        #print '\n'.join(map(str, self.args.items()))

        self.jobid = md5(','.join(sorted([md5(e) for e in
                                          parsed_id_string])))
        # self.jobid = md5(','.join(sorted([md5(str(pair)) for pair in
        #                                  self.args.iteritems()])))
        if parent_ids:
            self.jobid = md5(','.join(sorted(parent_ids+[self.jobid])))

        if not self.jobname:
            self.jobname = re.sub("[^0-9a-zA-Z]", "-", basename(self.bin))

        self.ifdone_cmd = ""
        self.iffail_cmd = ""
        self.set_jobdir(pjoin(GLOBALS["tasks_dir"], self.jobid))
        self.input_files = {}
        self.dependencies = set()
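A sketch of the jobid recipe above: hash each cleaned "<flag> <value>" string, sort the digests, and hash the joined result, so identical argument sets yield the same id regardless of dict order (assuming the module's md5() helper returns a hex digest string, approximated here):

    import hashlib
    md5 = lambda s: hashlib.md5(s.encode()).hexdigest()
    pairs = ["-m PROTGAMMAJTT", "-f e"]
    jobid = md5(','.join(sorted(md5(p) for p in pairs)))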
Example #25
def schedule(workflow_task_processor, pending_tasks, schedule_time, execution, debug, norender):
    # Adjust debug mode
    if debug == "all":
        log.setLevel(10)
    pending_tasks = set(pending_tasks)

    ## ===================================
    ## INITIALIZE BASIC VARS
    execution, run_detached = execution
    thread2tasks = defaultdict(list)
    for task in pending_tasks:
        thread2tasks[task.configid].append(task)
    expected_threads = set(thread2tasks.keys())
    past_threads = {}
    thread_errors = defaultdict(list)
    ## END OF VARS AND SHORTCUTS
    ## ===================================

    cores_total = GLOBALS["_max_cores"]
    if cores_total > 0:
        job_queue = Queue()

        back_launcher = Process(target=background_job_launcher,
                                args=(job_queue, run_detached,
                                      GLOBALS["launch_time"], cores_total))
        back_launcher.start()
    else:
        job_queue = None
        back_launcher = None

    GLOBALS["_background_scheduler"] = back_launcher
    GLOBALS["_job_queue"] = job_queue


    # Captures Ctrl-C for debugging
    #signal.signal(signal.SIGINT, control_c)

    last_report_time = None

    BUG = set()
    try:
        # Enters into task scheduling
        while pending_tasks:
            wtime = schedule_time

            # ask SGE for running jobs
            if execution == "sge":
                #sgeid2jobs = db.get_sge_tasks()
                #qstat_jobs = sge.qstat()
                pass
            else:
                qstat_jobs = None

            # Show summary of pending tasks per thread
            thread2tasks = defaultdict(list)
            for task in pending_tasks:
                thread2tasks[task.configid].append(task)
            set_logindent(0)
            log.log(28, "@@13: Updating tasks status:@@1: (%s)" % (ctime()))
            info_lines = []
            for tid, tlist in six.iteritems(thread2tasks):
                threadname = GLOBALS[tid]["_name"]
                sizelist = ["%s" %getattr(_ts, "size", "?") for _ts in tlist]
                info = "Thread @@13:%s@@1:: pending tasks: @@8:%s@@1: of sizes: %s" %(
                    threadname, len(tlist), ', '.join(sizelist))
                info_lines.append(info)

            for line in info_lines:
                log.log(28, line)

            if GLOBALS["email"] and last_report_time is None:
                last_report_time = time()
                send_mail(GLOBALS["email"], "Your NPR process has started", '\n'.join(info_lines))

            ## ================================
            ## CHECK AND UPDATE CURRENT TASKS
            checked_tasks = set()
            check_start_time = time()
            to_add_tasks = set()

            GLOBALS["cached_status"] = {}
            for task in sorted(pending_tasks, sort_tasks):
                # Avoids endless periods without new job submissions
                elapsed_time = time() - check_start_time
                #if not back_launcher and pending_tasks and \
                #        elapsed_time > schedule_time * 2:
                #    log.log(26, "@@8:Interrupting task checks to schedule new jobs@@1:")
                #    db.commit()
                #    wtime = launch_jobs(sorted(pending_tasks, sort_tasks),
                #                        execution, run_detached)
                #    check_start_time = time()

                # Enter debuging mode if necessary
                if debug and log.level > 10 and task.taskid.startswith(debug):
                    log.setLevel(10)
                    log.debug("ENTERING IN DEBUGGING MODE")
                thread2tasks[task.configid].append(task)

                # Update tasks and job statuses

                if task.taskid not in checked_tasks:
                    try:
                        show_task_info(task)
                        task.status = task.get_status(qstat_jobs)
                        db.dataconn.commit()
                        if back_launcher and task.status not in set("DE"):
                            for j, cmd in task.iter_waiting_jobs():
                                j.status = "Q"
                                GLOBALS["cached_status"][j.jobid] = "Q"
                                if j.jobid not in BUG:
                                    if not os.path.exists(j.jobdir):
                                        os.makedirs(j.jobdir)
                                    for ifile, outpath in six.iteritems(j.input_files):
                                        try:
                                            _tid, _did = ifile.split(".")
                                            _did = int(_did)
                                        except (IndexError, ValueError):
                                            dataid = ifile
                                        else:
                                            dataid = db.get_dataid(_tid, _did)

                                        if not outpath:
                                            outfile = pjoin(GLOBALS["input_dir"], ifile)
                                        else:
                                            outfile = pjoin(outpath, ifile)

                                        if not os.path.exists(outfile):
                                            open(outfile, "w").write(db.get_data(dataid))

                                    log.log(24, "  @@8:Queueing @@1: %s from %s" %(j, task))
                                    if execution:
                                        job_queue.put([j.jobid, j.cores, cmd, j.status_file])
                                BUG.add(j.jobid)

                        update_task_states_recursively(task)
                        db.commit()
                        checked_tasks.add(task.taskid)
                    except TaskError as e:
                        log.error("Errors found in %s" %task)
                        import traceback
                        traceback.print_exc()
                        if GLOBALS["email"]:
                            threadname = GLOBALS[task.configid]["_name"]
                            send_mail(GLOBALS["email"], "Errors found in %s!" %threadname,
                                      '\n'.join(map(str, [task, e.value, e.msg])))
                        pending_tasks.discard(task)
                        thread_errors[task.configid].append([task, e.value, e.msg])
                        continue
                else:
                    # Set temporary Queued state to avoid launching
                    # jobs from clones
                    task.status = "Q"
                    if log.level < 24:
                        show_task_info(task)

                if task.status == "D":
                    #db.commit()
                    show_task_info(task)
                    logindent(3)


                    # Log commands of every task
                    if 'cmd_log_file' not in GLOBALS[task.configid]:
                         GLOBALS[task.configid]['cmd_log_file'] = pjoin(GLOBALS[task.configid]["_outpath"], "cmd.log")
                         O = open(GLOBALS[task.configid]['cmd_log_file'], "w")
                         O.close()

                    cmd_lines =  get_cmd_log(task)
                    CMD_LOG = open(GLOBALS[task.configid]['cmd_log_file'], "a")
                    print(task, file=CMD_LOG)
                    for c in cmd_lines:
                        print('   '+'\t'.join(map(str, c)), file=CMD_LOG)
                    CMD_LOG.close()
                    #

                    try:
                        #wkname = GLOBALS[task.configid]['_name']
                        create_tasks = workflow_task_processor(task, task.target_wkname)
                    except TaskError as e:
                        log.error("Errors found in %s" %task)
                        pending_tasks.discard(task)
                        thread_errors[task.configid].append([task, e.value, e.msg])
                        continue
                    else:
                        logindent(-3)

                        to_add_tasks.update(create_tasks)
                        pending_tasks.discard(task)

                elif task.status == "E":
                    log.error("task contains errors: %s " %task)
                    log.error("Errors found in %s" % task)
                    pending_tasks.discard(task)
                    thread_errors[task.configid].append([task, None, "Found (E) task status"])

            #db.commit()
            #if not back_launcher:
            #    wtime = launch_jobs(sorted(pending_tasks, sort_tasks),
            #                    execution, run_detached)

            # Update global task list with recently added jobs to be check
            # during next cycle
            pending_tasks.update(to_add_tasks)

            ## END CHECK AND UPDATE CURRENT TASKS
            ## ================================

            if wtime:
                set_logindent(0)
                log.log(28, "@@13:Waiting %s seconds@@1:" %wtime)
                sleep(wtime)
            else:
                sleep(schedule_time)

            # Dump / show ended threads
            error_lines = []
            for configid, etasks in six.iteritems(thread_errors):
                error_lines.append("Thread @@10:%s@@1: contains errors:" %\
                            (GLOBALS[configid]["_name"]))
                for error in etasks:
                    error_lines.append(" ** %s" %error[0])
                    e_obj = error[1] if error[1] else error[0]
                    error_path = e_obj.jobdir if isjob(e_obj) else e_obj.taskid
                    if e_obj is not error[0]:
                        error_lines.append("      -> %s" %e_obj)
                    error_lines.append("      -> %s" %error_path)
                    error_lines.append("        -> %s" %error[2])
            for eline in error_lines:
                log.error(eline)

            pending_threads = set([ts.configid for ts in pending_tasks])
            finished_threads = expected_threads - (pending_threads | set(thread_errors.keys()))
            just_finished_lines = []
            finished_lines = []
            for configid in finished_threads:
                # configid is the same as threadid in master tasks
                final_tree_file = pjoin(GLOBALS[configid]["_outpath"],
                                        GLOBALS["inputname"] + ".final_tree")
                threadname = GLOBALS[configid]["_name"]

                if configid in past_threads:
                    log.log(28, "Done thread @@12:%s@@1: in %d iteration(s)",
                            threadname, past_threads[configid])
                    finished_lines.append("Finished %s in %d iteration(s)" %(
                            threadname, past_threads[configid]))
                else:

                    log.log(28, "Assembling final tree...")
                    main_tree, treeiters =  assembly_tree(configid)
                    past_threads[configid] = treeiters - 1

                    log.log(28, "Done thread @@12:%s@@1: in %d iteration(s)",
                            threadname, past_threads[configid])


                    log.log(28, "Writing final tree for @@13:%s@@1:\n   %s\n   %s",
                            threadname, final_tree_file+".nw",
                            final_tree_file+".nwx (newick extended)")
                    main_tree.write(outfile=final_tree_file+".nw")
                    main_tree.write(outfile=final_tree_file+ ".nwx", features=[],
                                    format_root_node=True)

                    if hasattr(main_tree, "alg_path"):
                        log.log(28, "Writing root node alignment @@13:%s@@1:\n   %s",
                                threadname, final_tree_file+".fa")

                        alg = SeqGroup(get_stored_data(main_tree.alg_path))
                        OUT = open(final_tree_file+".fa", "w")
                        for name, seq, comments in alg:
                            realname = db.get_seq_name(name)
                            print(">%s\n%s" %(realname, seq), file=OUT)
                        OUT.close()

                    if hasattr(main_tree, "clean_alg_path"):
                        log.log(28, "Writing root node trimmed alignment @@13:%s@@1:\n   %s",
                                threadname, final_tree_file+".trimmed.fa")

                        alg = SeqGroup(get_stored_data(main_tree.clean_alg_path))
                        OUT = open(final_tree_file+".trimmed.fa", "w")
                        for name, seq, comments in alg:
                            realname = db.get_seq_name(name)
                            print(">%s\n%s" %(realname, seq), file=OUT)
                        OUT.close()

                    if norender == False:
                        log.log(28, "Generating tree image for @@13:%s@@1:\n   %s",
                                threadname, final_tree_file+".png")
                        for lf in main_tree:
                            lf.add_feature("sequence", alg.get_seq(lf.safename))
                        try:
                            from ete3.tools.phylobuild_lib.visualize import draw_tree
                            draw_tree(main_tree, GLOBALS[configid], final_tree_file+".png")
                        except Exception as e:
                            log.warning('@@8:something went wrong when generating the tree image. Try manually :(@@1:')
                            if DEBUG:
                                import traceback, sys
                                traceback.print_exc(file=sys.stdout)

                    just_finished_lines.append("Finished %s in %d iteration(s)" %(
                            threadname, past_threads[configid]))
            if GLOBALS["email"]:
                if not pending_tasks:
                    all_lines = finished_lines + just_finished_lines + error_lines
                    send_mail(GLOBALS["email"], "Your NPR process has ended", '\n'.join(all_lines))

                elif GLOBALS["email_report_time"] and time() - last_report_time >= \
                        GLOBALS["email_report_time"]:
                    all_lines = info_lines + error_lines + just_finished_lines
                    send_mail(GLOBALS["email"], "Your NPR report", '\n'.join(all_lines))
                    last_report_time = time()

                elif just_finished_lines:
                    send_mail(GLOBALS["email"], "Finished threads!",
                              '\n'.join(just_finished_lines))

            log.log(26, "")
    except:
        raise

    if thread_errors:
        log.error("Done with ERRORS")
    else:
        log.log(28, "Done")

    return thread_errors
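A self-contained sketch of the thread2tasks grouping used twice in schedule(), with a stand-in task object:

    from collections import defaultdict

    class FakeTask:
        def __init__(self, configid):
            self.configid = configid

    thread2tasks = defaultdict(list)
    for task in [FakeTask("w1"), FakeTask("w2"), FakeTask("w1")]:
        thread2tasks[task.configid].append(task)
    assert len(thread2tasks["w1"]) == 2   # two pending tasks in thread w1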
Example #26
    def load_jobs(self):
        args = OrderedDict(self.args)
        args["-s"] = pjoin(GLOBALS["input_dir"], self.alg_phylip_file)
        args["-m"] = self.model_string
        args["-n"] = self.alg_phylip_file
        if self.constrain_tree:
            log.log(24, "Using constrain tree %s" %self.constrain_tree)
            args["-g"] = pjoin(GLOBALS["input_dir"], self.constrain_tree)
        if self.partitions_file:
            log.log(24, "Using alg partitions %s" %self.partitions_file)
            args['-q'] = pjoin(GLOBALS["input_dir"], self.partitions_file)

        tree_job = Job(self.raxml_bin, args, parent_ids=[self.nodeid])
        tree_job.jobname += "-"+self.model_string
        tree_job.cores = self.threads
        # Register input files necessary to run the job
        tree_job.add_input_file(self.alg_phylip_file)
        if self.constrain_tree:
            tree_job.add_input_file(self.constrain_tree)
        if self.partitions_file:
            tree_job.add_input_file(self.partitions_file)

        self.jobs.append(tree_job)
        self.out_tree_file = os.path.join(tree_job.jobdir,
                                          "RAxML_bestTree." + self.alg_phylip_file)

        if self.bootstrap == "alrt":
            alrt_args = tree_job.args.copy()
            if self.constrain_tree:
                del alrt_args["-g"]
            if self.partitions_file:
                alrt_args["-q"] = args['-q']

            alrt_args["-f"] = "J"
            alrt_args["-t"] = self.out_tree_file
            alrt_job = Job(self.raxml_bin, alrt_args,
                           parent_ids=[tree_job.jobid])
            alrt_job.jobname += "-alrt"
            alrt_job.dependencies.add(tree_job)
            alrt_job.cores = self.threads

            # Register necessary input files
            alrt_job.add_input_file(self.alg_phylip_file)
            if self.partitions_file:
                alrt_job.add_input_file(self.partitions_file)

            self.jobs.append(alrt_job)
            self.alrt_job = alrt_job

        elif self.bootstrap == "alrt_phyml":
            alrt_args = {
                "-o": "n",
                "-i": self.alg_phylip_file,
                "--bootstrap": "-2",
                "-d": self.seqtype,
                "-u": self.out_tree_file,
                "--model": self.model,
                "--quiet": "",
                "--no_memory_check": "",
                }
            #if self.constrain_tree:
            #    alrt_args["--constraint_tree"] = self.constrain_tree

            alrt_job = Job(self.conf["app"]["phyml"],
                           alrt_args, parent_ids=[tree_job.jobid])
            alrt_job.add_input_file(self.alg_phylip_file, alrt_job.jobdir)
            alrt_job.jobname += "-alrt"
            alrt_job.dependencies.add(tree_job)
            alrt_job.add_input_file(self.alg_phylip_file)
            self.jobs.append(alrt_job)
            self.alrt_job = alrt_job

        else:
            # Bootstrap calculation
            boot_args = tree_job.args.copy()
            boot_args["-n"] = "bootstraps."+boot_args["-n"]
            boot_args["-N"] = int(self.bootstrap)
            boot_args["-b"] = 31416
            boot_job = Job(self.raxml_bin, boot_args,
                           parent_ids=[tree_job.jobid])
            boot_job.jobname += "-%d-bootstraps" %(boot_args['-N'])
            boot_job.dependencies.add(tree_job)
            boot_job.cores = self.threads

            # Register necessary input files
            boot_job.add_input_file(self.alg_phylip_file)
            if self.constrain_tree:
                boot_job.add_input_file(self.constrain_tree)
            if self.partitions_file:
                boot_job.add_input_file(self.partitions_file)

            self.jobs.append(boot_job)

            # Bootstrap drawing on top of best tree
            bootd_args = tree_job.args.copy()
            if self.constrain_tree:
                del bootd_args["-g"]
            if self.partitions_file:
                del bootd_args["-q"]

            bootd_args["-n"] = "bootstrapped."+ tree_job.args["-n"]
            bootd_args["-f"] = "b"
            bootd_args["-t"] = self.out_tree_file
            bootd_args["-z"] = pjoin(boot_job.jobdir, "RAxML_bootstrap." + boot_job.args["-n"])

            bootd_job = Job(self.raxml_bin, bootd_args,
                            parent_ids=[tree_job.jobid])
            bootd_job.jobname += "-bootstrapped"
            bootd_job.dependencies.add(boot_job)
            bootd_job.cores = self.threads
            self.jobs.append(bootd_job)

            self.boot_job = boot_job
            self.bootd_job = bootd_job
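A sketch of the standard-bootstrap branch above: the ML-search arguments are cloned, the run name is prefixed, and a replicate count plus seed are added (paths and the replicate count are illustrative; the 31416 seed is the one hardcoded above):

    tree_args = {"-s": "alg.phy", "-m": "PROTGAMMAJTT", "-n": "alg.phy"}
    boot_args = tree_args.copy()
    boot_args["-n"] = "bootstraps." + boot_args["-n"]
    boot_args["-N"] = 100       # number of bootstrap replicates
    boot_args["-b"] = 31416     # fixed random seed, as in the code above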
Example #27
def process_task(task, wkname, npr_conf, nodeid2info):
    alignerconf, alignerclass = npr_conf.aligner
    cleanerconf, cleanerclass = npr_conf.alg_cleaner
    mtesterconf, mtesterclass = npr_conf.model_tester
    treebuilderconf, treebuilderclass = npr_conf.tree_builder
    if not treebuilderclass:
        # Allows dumping alignments in workflows with no tree tasks
        treebuilderclass = DummyTree

    splitterconf, splitterclass = npr_conf.tree_splitter

    conf = GLOBALS[task.configid]
    seqtype = task.seqtype
    nodeid = task.nodeid
    ttype = task.ttype
    taskid = task.taskid
    threadid = task.threadid
    node_info = nodeid2info[nodeid]
    size = task.size  # node_info.get("size", 0)
    target_seqs = node_info.get("target_seqs", [])
    out_seqs = node_info.get("out_seqs", [])

    if not treebuilderclass or size < 4:
        # Allows dumping alignments in workflows with no tree tasks, or when
        # tree inference does not make sense given the number of sequences.
        # DummyTree will produce a fake, fully collapsed newick tree.
        treebuilderclass = DummyTree

    # If more than one outgroup is used, enable the use of a constraint tree
    if out_seqs and len(out_seqs) > 1:
        constrain_id = nodeid
    else:
        constrain_id = None

    new_tasks = []
    if ttype == "msf":
        # Register Tree constrains
        constrain_tree = "(%s, (%s));" %(','.join(sorted(task.out_seqs)),
                                         ','.join(sorted(task.target_seqs)))
        _outs = "\n".join([">%s\n0" %name for name in sorted(task.out_seqs)])
        _tars = "\n".join([">%s\n1" %name for name in sorted(task.target_seqs)])
        constrain_alg = '\n'.join([_outs, _tars])
        db.add_task_data(nodeid, DATATYPES.constrain_tree, constrain_tree)
        db.add_task_data(nodeid, DATATYPES.constrain_alg, constrain_alg)
        db.dataconn.commit()  # since the creation of some Task
                              # objects may require this info, we need
                              # to commit right now.

        # Register node
        db.add_node(task.threadid,
                    task.nodeid, task.cladeid,
                    task.target_seqs,
                    task.out_seqs)

        nodeid2info[nodeid]["size"] = task.size
        nodeid2info[nodeid]["target_seqs"] = task.target_seqs
        nodeid2info[nodeid]["out_seqs"] = task.out_seqs
        alg_task = alignerclass(nodeid, task.multiseq_file,
                                seqtype, conf, alignerconf)
        alg_task.size = task.size
        new_tasks.append(alg_task)


    elif ttype == "alg" or ttype == "acleaner":
        if ttype == "alg":
            nodeid2info[nodeid]["alg_path"] = task.alg_fasta_file
        elif ttype == "acleaner":
            nodeid2info[nodeid]["alg_clean_path"] = task.clean_alg_fasta_file

        alg_fasta_file = getattr(task, "clean_alg_fasta_file",
                                 task.alg_fasta_file)
        alg_phylip_file = getattr(task, "clean_alg_phylip_file",
                                  task.alg_phylip_file)

        # Calculate alignment stats
        # cons_mean, cons_std = get_trimal_conservation(task.alg_fasta_file,
        #                                        conf["app"]["trimal"])
        #
        # max_identity = get_trimal_identity(task.alg_fasta_file,
        #                                 conf["app"]["trimal"])
        # log.info("Conservation: %0.2f +-%0.2f", cons_mean, cons_std)
        # log.info("Max. Identity: %0.2f", max_identity)
        #import time
        #t1 = time.time()
        #mx, mn, mean, std = get_identity(task.alg_fasta_file)
        #print time.time()-t1
        #log.log(26, "Identity: max=%0.2f min=%0.2f mean=%0.2f +- %0.2f",
        #        mx, mn, mean, std)
        #t1 = time.time()

        if seqtype == "aa" and npr_conf.switch_aa_similarity < 1:
            try:
                alg_stats = db.get_task_data(taskid, DATATYPES.alg_stats)
            except Exception as e:
                alg_stats = {}

            if ttype == "alg":
                algfile = pjoin(GLOBALS["input_dir"], task.alg_phylip_file)
                dataid = DATATYPES.alg_phylip
            elif ttype == "acleaner":
                algfile = pjoin(GLOBALS["input_dir"], task.clean_alg_phylip_file)
                dataid = DATATYPES.clean_alg_phylip

            if "i_mean" not in alg_stats:
                log.log(24, "Calculating alignment stats...")
                # dump data if necessary
                algfile = pjoin(GLOBALS["input_dir"], task.alg_phylip_file)
                if not pexist(algfile):
                    # dump phylip alg
                    open(algfile, "w").write(db.get_data(db.get_dataid(taskid, dataid)))

                mx, mn, mean, std = get_statal_identity(algfile,
                                                        conf["app"]["statal"])
                alg_stats = {"i_max":mx, "i_mean":mean, "i_min":mn, "i_std":std}
                db.add_task_data(taskid, DATATYPES.alg_stats, alg_stats)

            log.log(22, "Alignment stats (sequence similarity):")
            log.log(22, "   max: %(i_max)0.2f, min:%(i_min)0.2f, avg:%(i_mean)0.2f+-%(i_std)0.2f" %
                    (alg_stats))

        else:
            alg_stats = {"i_max":-1, "i_mean":-1, "i_min":-1, "i_std":-1}

        #print time.time()-t1
        #log.log(24, "Identity: max=%0.2f min=%0.2f mean=%0.2f +- %0.2f",
        #        mx, mn, mean, std)
        task.max_ident = alg_stats["i_max"]
        task.min_ident = alg_stats["i_min"]
        task.mean_ident = alg_stats["i_mean"]
        task.std_ident = alg_stats["i_std"]
        next_task = None

        if ttype == "alg" and cleanerclass:
            next_task = cleanerclass(nodeid, seqtype, alg_fasta_file,
                                     alg_phylip_file,
                                     conf, cleanerconf)
        else:
            # Converts aa alignment into nt if necessary
            if seqtype == "aa" and \
                    "nt" in GLOBALS["seqtypes"] and \
                    task.mean_ident >= npr_conf.switch_aa_similarity:
                log.log(28, "@@2:Switching to codon alignment!@@1: amino-acid sequence similarity: %0.2f >= %0.2f" %\
                        (task.mean_ident, npr_conf.switch_aa_similarity))
                alg_fasta_file = "%s.%s" %(taskid, DATATYPES.alg_nt_fasta)
                alg_phylip_file = "%s.%s" %(taskid, DATATYPES.alg_nt_phylip)
                try:
                    alg_fasta_file = db.get_dataid(taskid, DATATYPES.alg_nt_fasta)
                    alg_phylip_file = db.get_dataid(taskid, DATATYPES.alg_nt_phylip)
                except ValueError:
                    log.log(22, "Calculating codon alignment...")

                    source_alg = pjoin(GLOBALS["input_dir"], task.alg_fasta_file)
                    if ttype == "alg":
                        kept_columns = []
                    elif ttype == "acleaner":
                        # if original alignment was trimmed, use it as reference
                        # but make the nt alignment only on the kept columns
                        kept_columns = db.get_task_data(taskid, DATATYPES.kept_alg_columns)

                    if not pexist(source_alg):
                        open(source_alg, "w").write(db.get_task_data(taskid, DATATYPES.alg_fasta))

                    nt_alg = switch_to_codon(source_alg, kept_columns=kept_columns)
                    db.add_task_data(taskid, DATATYPES.alg_nt_fasta, nt_alg.write())
                    db.add_task_data(taskid, DATATYPES.alg_nt_phylip, nt_alg.write(format='iphylip_relaxed'))

                npr_conf = IterConfig(conf, wkname, task.size, "nt")
                seqtype = "nt"

            if mtesterclass:
                next_task = mtesterclass(nodeid, alg_fasta_file,
                                         alg_phylip_file,
                                         constrain_id,
                                         conf, mtesterconf)
            elif treebuilderclass:
                next_task = treebuilderclass(nodeid, alg_phylip_file,
                                             constrain_id,
                                             None, seqtype,
                                             conf, treebuilderconf)
        if next_task:
            next_task.size = task.size
            new_tasks.append(next_task)

    elif ttype == "mchooser":
        if treebuilderclass:
            alg_fasta_file = task.alg_fasta_file
            alg_phylip_file = task.alg_phylip_file
            model = task.best_model
            tree_task = treebuilderclass(nodeid, alg_phylip_file,
                                         constrain_id,
                                         model, seqtype,
                                         conf, treebuilderconf)
            tree_task.size = task.size
            new_tasks.append(tree_task)

    elif ttype == "tree":
        treemerge_task = splitterclass(nodeid, seqtype,
                                       task.tree_file, conf, splitterconf)
        #if conf["tree_splitter"]["_outgroup_size"]:
        #    treemerge_task = TreeSplitterWithOutgroups(nodeid, seqtype, task.tree_file, main_tree, conf)
        #else:
        #    treemerge_task = TreeSplitter(nodeid, seqtype, task.tree_file, main_tree, conf)

        treemerge_task.size = task.size
        new_tasks.append(treemerge_task)

    elif ttype == "treemerger":
        if not task.task_tree:
            task.finish()

        log.log(24, "Saving task tree...")
        annotate_node(task.task_tree, task)
        db.update_node(nid=task.nodeid,
                       runid=task.threadid,
                       newick=db.encode(task.task_tree))
        db.commit()

        if treebuilderclass is not DummyTree and npr_conf.max_iters > 1:
            current_iter = get_iternumber(threadid)
            if npr_conf.max_iters and current_iter >= npr_conf.max_iters:
                log.warning("Maximum number of iterations reached!")
            else:
                # Add new nodes
                source_seqtype = "aa" if "aa" in GLOBALS["seqtypes"] else "nt"
                ttree, mtree = task.task_tree, task.main_tree
                log.log(26, "Processing tree: %s seqs, %s outgroups",
                        len(target_seqs), len(out_seqs))
                alg_path = node_info.get("clean_alg_path", node_info["alg_path"])
                for node, seqs, outs, wkname in get_next_npr_node(threadid, ttree,
                                                          task.out_seqs, mtree,
                                                          alg_path, npr_conf):
                    log.log(24, "Registering new node: %s seqs, %s outgroups",
                            len(seqs), len(outs))
                    new_task_node = Msf(seqs, outs, seqtype=source_seqtype)
                    new_task_node.target_wkname = wkname
                    new_tasks.append(new_task_node)
    return new_tasks
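A sketch of the constraint newick built in the "msf" branch, with toy sequence names:

    out_seqs = {"O1", "O2"}
    target_seqs = {"T1", "T2", "T3"}
    constrain_tree = "(%s, (%s));" % (','.join(sorted(out_seqs)),
                                      ','.join(sorted(target_seqs)))
    assert constrain_tree == "(O1,O2, (T1,T2,T3));"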
Example #28
    def load_jobs(self):
        readal_bin = self.conf["app"]["readal"]
        trimal_bin = self.conf["app"]["trimal"]
        input_dir = GLOBALS["input_dir"]
        multiseq_file = pjoin(input_dir, self.multiseq_file)
        multiseq_file_r = pjoin(input_dir, self.multiseq_file+"_reversed")

        first = seq_reverser_job(multiseq_file, multiseq_file_r,
                                 [self.nodeid], readal_bin)
        #print self.multiseq_file
        first.add_input_file(self.multiseq_file)
        self.jobs.append(first)

        all_alg_names = []
        mcoffee_parents = []
        for aligner_name in self.conf[self.confname]["_aligners"]:
            aligner_name = aligner_name[1:]
            _classname = APP2CLASS[self.conf[aligner_name]["_app"]]

            _module = __import__(CLASS2MODULE[_classname], globals(), locals(), [], -1)
            _aligner = getattr(_module, _classname)

            # Normal alg
            task1 = _aligner(self.nodeid, self.multiseq_file, self.seqtype,
                             self.conf, aligner_name)
            task1.size = self.size
            self.jobs.append(task1)
            all_alg_names.append(task1.alg_fasta_file)


            # Alg of the reverse
            task2 = _aligner(self.nodeid, self.multiseq_file+"_reversed",
                             self.seqtype, self.conf, aligner_name)
            task2.size = self.size
            task2.dependencies.add(first)
            self.jobs.append(task2)

            # Restore reverse alg
            reverse_out = pjoin(input_dir, task2.alg_fasta_file)
            task3 = seq_reverser_job(reverse_out,
                                     reverse_out+"_restored",
                                     [task2.taskid], readal_bin)
            task3.dependencies.add(task2)
            task3.add_input_file(task2.alg_fasta_file)
            all_alg_names.append(reverse_out+"_restored")
            self.jobs.append(task3)
            mcoffee_parents.extend([task1.taskid, task2.taskid])

        # Combine signal from all algs using Mcoffee
        mcoffee_task = MCoffee(self.nodeid, self.seqtype, all_alg_names,
                               self.conf, self.confname, parent_ids=mcoffee_parents)
        # reversed algs are not actually saved into the db, but the restored
        # files should be present since the reverser job is always executed
        mcoffee_task.dependencies.update(list(self.jobs))
        self.jobs.append(mcoffee_task)

        if self.conf[self.confname]["_alg_trimming"]:
            trimming_cutoff = 1.0 / len(all_alg_names)
            targs = {}
            targs["-forceselect"] = pjoin(input_dir, mcoffee_task.alg_fasta_file)
            targs["-compareset"] = pjoin(input_dir, mcoffee_task.alg_list_file)
            targs["-out"] = "mcoffee.trimmed.fasta"
            targs["-fasta"] = ""
            targs["-ct"] = trimming_cutoff
            trim_job = Job(trimal_bin, targs, parent_ids=[mcoffee_task.taskid])
            trim_job.jobname = "McoffeeTrimming"
            trim_job.dependencies.add(mcoffee_task)
            trim_job.alg_fasta_file = targs["-out"]
            for key in all_alg_names:
                trim_job.add_input_file(key)
            trim_job.add_input_file(mcoffee_task.alg_fasta_file)
            trim_job.add_input_file(mcoffee_task.alg_list_file)
            self.jobs.append(trim_job)
Example #29
def schedule(workflow_task_processor, pending_tasks, schedule_time, execution,
             debug, norender):
    # Adjust debug mode
    if debug == "all":
        log.setLevel(10)
    pending_tasks = set(pending_tasks)

    ## ===================================
    ## INITIALIZE BASIC VARS
    execution, run_detached = execution
    thread2tasks = defaultdict(list)
    for task in pending_tasks:
        thread2tasks[task.configid].append(task)
    expected_threads = set(thread2tasks.keys())
    past_threads = {}
    thread_errors = defaultdict(list)
    ## END OF VARS AND SHORTCUTS
    ## ===================================

    cores_total = GLOBALS["_max_cores"]
    if cores_total > 0:
        job_queue = Queue()

        back_launcher = Process(target=background_job_launcher,
                                args=(job_queue, run_detached,
                                      GLOBALS["launch_time"], cores_total))
        back_launcher.start()
    else:
        job_queue = None
        back_launcher = None

    GLOBALS["_background_scheduler"] = back_launcher
    GLOBALS["_job_queue"] = job_queue

    # Captures Ctrl-C for debugging
    #signal.signal(signal.SIGINT, control_c)

    last_report_time = None

    BUG = set()
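    # BUG tracks job ids already materialized and queued, preventing
    # duplicate submissions when tasks are revisited in later cycles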
    try:
        # Enter the task-scheduling loop
        while pending_tasks:
            wtime = schedule_time

            # ask SGE for running jobs
            if execution == "sge":
                #sgeid2jobs = db.get_sge_tasks()
                #qstat_jobs = sge.qstat()
                pass
            else:
                qstat_jobs = None

            # Show summary of pending tasks per thread
            thread2tasks = defaultdict(list)
            for task in pending_tasks:
                thread2tasks[task.configid].append(task)
            set_logindent(0)
            log.log(28, "@@13: Updating tasks status:@@1: (%s)" % (ctime()))
            info_lines = []
            for tid, tlist in six.iteritems(thread2tasks):
                threadname = GLOBALS[tid]["_name"]
                sizelist = ["%s" % getattr(_ts, "size", "?") for _ts in tlist]
                info = "Thread @@13:%s@@1:: pending tasks: @@8:%s@@1: of sizes: %s" % (
                    threadname, len(tlist), ', '.join(sizelist))
                info_lines.append(info)

            for line in info_lines:
                log.log(28, line)

            if GLOBALS["email"] and last_report_time is None:
                last_report_time = time()
                send_mail(GLOBALS["email"], "Your NPR process has started",
                          '\n'.join(info_lines))

            ## ================================
            ## CHECK AND UPDATE CURRENT TASKS
            checked_tasks = set()
            check_start_time = time()
            to_add_tasks = set()

            GLOBALS["cached_status"] = {}
            # sort_tasks is a cmp-style comparator; functools.cmp_to_key
            # (assumed imported at module level) adapts it for Python 3
            for task in sorted(pending_tasks, key=cmp_to_key(sort_tasks)):
                # Avoids endless periods without new job submissions
                elapsed_time = time() - check_start_time
                #if not back_launcher and pending_tasks and \
                #        elapsed_time > schedule_time * 2:
                #    log.log(26, "@@8:Interrupting task checks to schedule new jobs@@1:")
                #    db.commit()
                #    wtime = launch_jobs(sorted(pending_tasks, sort_tasks),
                #                        execution, run_detached)
                #    check_start_time = time()

                # Enter debugging mode if necessary
                if debug and log.level > 10 and task.taskid.startswith(debug):
                    log.setLevel(10)
                    log.debug("ENTERING DEBUGGING MODE")

                # Update tasks and job statuses

                if task.taskid not in checked_tasks:
                    try:
                        show_task_info(task)
                        task.status = task.get_status(qstat_jobs)
                        db.dataconn.commit()
                        if back_launcher and task.status not in set("DE"):
                            for j, cmd in task.iter_waiting_jobs():
                                j.status = "Q"
                                GLOBALS["cached_status"][j.jobid] = "Q"
                                if j.jobid not in BUG:
                                    if not os.path.exists(j.jobdir):
                                        os.makedirs(j.jobdir)
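                                    # Materialize registered input files:
                                    # "taskid.dataid" names are resolved via
                                    # the db; other names are used directly
                                    # as data ids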
                                    for ifile, outpath in six.iteritems(
                                            j.input_files):
                                        try:
                                            _tid, _did = ifile.split(".")
                                            _did = int(_did)
                                        except (IndexError, ValueError):
                                            dataid = ifile
                                        else:
                                            dataid = db.get_dataid(_tid, _did)

                                        if not outpath:
                                            outfile = pjoin(
                                                GLOBALS["input_dir"], ifile)
                                        else:
                                            outfile = pjoin(outpath, ifile)

                                        if not os.path.exists(outfile):
                                            open(outfile, "w").write(
                                                db.get_data(dataid))

                                    log.log(
                                        24, "  @@8:Queueing @@1: %s from %s" %
                                        (j, task))
                                    if execution:
                                        job_queue.put([
                                            j.jobid, j.cores, cmd,
                                            j.status_file
                                        ])
                                BUG.add(j.jobid)

                        update_task_states_recursively(task)
                        db.commit()
                        checked_tasks.add(task.taskid)
                    except TaskError as e:
                        log.error("Errors found in %s" % task)
                        import traceback
                        traceback.print_exc()
                        if GLOBALS["email"]:
                            threadname = GLOBALS[task.configid]["_name"]
                            send_mail(
                                GLOBALS["email"],
                                "Errors found in %s!" % threadname,
                                '\n'.join(map(str, [task, e.value, e.msg])))
                        pending_tasks.discard(task)
                        thread_errors[task.configid].append(
                            [task, e.value, e.msg])
                        continue
                else:
                    # Set a temporary Queued state to avoid launching
                    # jobs from clones
                    task.status = "Q"
                    if log.level < 24:
                        show_task_info(task)

                if task.status == "D":
                    #db.commit()
                    show_task_info(task)
                    logindent(3)

                    # Log commands of every task
                    if 'cmd_log_file' not in GLOBALS[task.configid]:
                        GLOBALS[task.configid]['cmd_log_file'] = pjoin(
                            GLOBALS[task.configid]["_outpath"], "cmd.log")
                        O = open(GLOBALS[task.configid]['cmd_log_file'], "w")
                        O.close()

                    cmd_lines = get_cmd_log(task)
                    CMD_LOG = open(GLOBALS[task.configid]['cmd_log_file'], "a")
                    print(task, file=CMD_LOG)
                    for c in cmd_lines:
                        print('   ' + '\t'.join(map(str, c)), file=CMD_LOG)
                    CMD_LOG.close()
                    #

                    try:
                        #wkname = GLOBALS[task.configid]['_name']
                        create_tasks = workflow_task_processor(
                            task, task.target_wkname)
                    except TaskError as e:
                        log.error("Errors found in %s" % task)
                        pending_tasks.discard(task)
                        thread_errors[task.configid].append(
                            [task, e.value, e.msg])
                        continue
                    else:
                        logindent(-3)

                        to_add_tasks.update(create_tasks)
                        pending_tasks.discard(task)

                elif task.status == "E":
                    log.error("Task contains errors: %s" % task)
                    pending_tasks.discard(task)
                    thread_errors[task.configid].append(
                        [task, None, "Found (E) task status"])

            #db.commit()
            #if not back_launcher:
            #    wtime = launch_jobs(sorted(pending_tasks, sort_tasks),
            #                    execution, run_detached)

            # Update the global task list with recently added jobs, to be
            # checked during the next cycle
            pending_tasks.update(to_add_tasks)

            ## END CHECK AND UPDATE CURRENT TASKS
            ## ================================

            if wtime:
                set_logindent(0)
                log.log(28, "@@13:Waiting %s seconds@@1:" % wtime)
                sleep(wtime)
            else:
                sleep(schedule_time)

            # Dump / show ended threads
            error_lines = []
            for configid, etasks in six.iteritems(thread_errors):
                error_lines.append("Thread @@10:%s@@1: contains errors:" %\
                            (GLOBALS[configid]["_name"]))
                for error in etasks:
                    error_lines.append(" ** %s" % error[0])
                    e_obj = error[1] if error[1] else error[0]
                    error_path = e_obj.jobdir if isjob(e_obj) else e_obj.taskid
                    if e_obj is not error[0]:
                        error_lines.append("      -> %s" % e_obj)
                    error_lines.append("      -> %s" % error_path)
                    error_lines.append("        -> %s" % error[2])
            for eline in error_lines:
                log.error(eline)

            pending_threads = set([ts.configid for ts in pending_tasks])
            finished_threads = expected_threads - (pending_threads
                                                   | set(thread_errors.keys()))
            just_finished_lines = []
            finished_lines = []
            for configid in finished_threads:
                # configid is the same as threadid in master tasks
                final_tree_file = pjoin(GLOBALS[configid]["_outpath"],
                                        GLOBALS["inputname"] + ".final_tree")
                threadname = GLOBALS[configid]["_name"]

                if configid in past_threads:
                    log.log(28, "Done thread @@12:%s@@1: in %d iteration(s)",
                            threadname, past_threads[configid])
                    finished_lines.append("Finished %s in %d iteration(s)" %
                                          (threadname, past_threads[configid]))
                else:

                    log.log(28, "Assembling final tree...")
                    main_tree, treeiters = assembly_tree(configid)
                    past_threads[configid] = treeiters - 1

                    log.log(28, "Done thread @@12:%s@@1: in %d iteration(s)",
                            threadname, past_threads[configid])

                    log.log(
                        28, "Writing final tree for @@13:%s@@1:\n   %s\n   %s",
                        threadname, final_tree_file + ".nw",
                        final_tree_file + ".nwx (newick extended)")
                    main_tree.write(outfile=final_tree_file + ".nw")
                    main_tree.write(outfile=final_tree_file + ".nwx",
                                    features=[],
                                    format_root_node=True)

                    if hasattr(main_tree, "alg_path"):
                        log.log(
                            28,
                            "Writing root node alignment @@13:%s@@1:\n   %s",
                            threadname, final_tree_file + ".fa")

                        alg = SeqGroup(get_stored_data(main_tree.alg_path))
                        OUT = open(final_tree_file + ".fa", "w")
                        for name, seq, comments in alg:
                            realname = db.get_seq_name(name)
                            print(">%s\n%s" % (realname, seq), file=OUT)
                        OUT.close()

                    if hasattr(main_tree, "clean_alg_path"):
                        log.log(
                            28,
                            "Writing root node trimmed alignment @@13:%s@@1:\n   %s",
                            threadname, final_tree_file + ".trimmed.fa")

                        alg = SeqGroup(
                            get_stored_data(main_tree.clean_alg_path))
                        OUT = open(final_tree_file + ".trimmed.fa", "w")
                        for name, seq, comments in alg:
                            realname = db.get_seq_name(name)
                            print(">%s\n%s" % (realname, seq), file=OUT)
                        OUT.close()

                    if not norender:
                        log.log(
                            28, "Generating tree image for @@13:%s@@1:\n   %s",
                            threadname, final_tree_file + ".png")
                        if hasattr(main_tree, "alg_path") or \
                                hasattr(main_tree, "clean_alg_path"):
                            # 'alg' is only defined when one of the alignment
                            # blocks above ran
                            for lf in main_tree:
                                lf.add_feature("sequence",
                                               alg.get_seq(lf.safename))
                        try:
                            from ete3.tools.phylobuild_lib.visualize import draw_tree
                            draw_tree(main_tree, GLOBALS[configid],
                                      final_tree_file + ".png")
                        except Exception as e:
                            log.warning(
                                '@@8:something went wrong when generating the tree image. Try manually :(@@1:'
                            )
                            if DEBUG:
                                import traceback, sys
                                traceback.print_exc(file=sys.stdout)

                    just_finished_lines.append(
                        "Finished %s in %d iteration(s)" %
                        (threadname, past_threads[configid]))
            if GLOBALS["email"]:
                if not pending_tasks:
                    all_lines = finished_lines + just_finished_lines + error_lines
                    send_mail(GLOBALS["email"], "Your NPR process has ended",
                              '\n'.join(all_lines))

                elif GLOBALS["email_report_time"] and time() - last_report_time >= \
                        GLOBALS["email_report_time"]:
                    all_lines = info_lines + error_lines + just_finished_lines
                    send_mail(GLOBALS["email"], "Your NPR report",
                              '\n'.join(all_lines))
                    last_report_time = time()

                elif just_finished_lines:
                    send_mail(GLOBALS["email"], "Finished threads!",
                              '\n'.join(just_finished_lines))

            log.log(26, "")
    except:
        # No cleanup needed here; re-raise so errors (and Ctrl-C) propagate
        raise

    if thread_errors:
        log.error("Done with ERRORS")
    else:
        log.log(28, "Done")

    return thread_errors
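
background_job_launcher is defined elsewhere in the module; conceptually it is the consumer side of job_queue. A minimal sketch of such a consumer, using only the [jobid, cores, cmd, status_file] item layout visible in the put() call above (the core-accounting logic here is an assumption, not the real implementation):

import subprocess
import time

def minimal_job_launcher(job_queue, max_cores):
    running = {}  # jobid -> (Popen handle, reserved cores)
    while True:
        jobid, cores, cmd, status_file = job_queue.get()  # status_file ignored here
        # Naive core accounting: wait until enough cores are free
        while sum(c for _, c in running.values()) + cores > max_cores:
            for jid, (proc, c) in list(running.items()):
                if proc.poll() is not None:  # job finished, release its cores
                    del running[jid]
            time.sleep(1)
        running[jobid] = (subprocess.Popen(cmd, shell=True), cores)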
Example #33
    def load_jobs(self):
        args = OrderedDict(self.args)
        args["-s"] = pjoin(GLOBALS["input_dir"], self.alg_phylip_file)
        args["-m"] = self.model_string
        args["-n"] = self.alg_phylip_file
        if self.constrain_tree:
            log.log(24, "Using constrain tree %s" % self.constrain_tree)
            args["-g"] = pjoin(GLOBALS["input_dir"], self.constrain_tree)
        if self.partitions_file:
            log.log(24, "Using alg partitions %s" % self.partitions_file)
            args['-q'] = pjoin(GLOBALS["input_dir"], self.partitions_file)

        tree_job = Job(self.raxml_bin, args, parent_ids=[self.nodeid])
        tree_job.jobname += "-" + self.model_string
        tree_job.cores = self.threads
        # Register input files necessary to run the job
        tree_job.add_input_file(self.alg_phylip_file)
        if self.constrain_tree:
            tree_job.add_input_file(self.constrain_tree)
        if self.partitions_file:
            tree_job.add_input_file(self.partitions_file)

        self.jobs.append(tree_job)
        self.out_tree_file = os.path.join(
            tree_job.jobdir, "RAxML_bestTree." + self.alg_phylip_file)

        if self.bootstrap == "alrt":
            alrt_args = tree_job.args.copy()
            if self.constrain_tree:
                del alrt_args["-g"]
            if self.partitions_file:
                alrt_args["-q"] = args['-q']

            alrt_args["-f"] = "J"
            alrt_args["-t"] = self.out_tree_file
            alrt_job = Job(self.raxml_bin,
                           alrt_args,
                           parent_ids=[tree_job.jobid])
            alrt_job.jobname += "-alrt"
            alrt_job.dependencies.add(tree_job)
            alrt_job.cores = self.threads

            # Register necessary input files
            alrt_job.add_input_file(self.alg_phylip_file)
            if self.partitions_file:
                alrt_job.add_input_file(self.partitions_file)

            self.jobs.append(alrt_job)
            self.alrt_job = alrt_job

        elif self.bootstrap == "alrt_phyml":
            alrt_args = {
                "-o": "n",
                "-i": self.alg_phylip_file,
                "--bootstrap": "-2",
                "-d": self.seqtype,
                "-u": self.out_tree_file,
                "--model": self.model,
                "--quiet": "",
                "--no_memory_check": "",
            }
            #if self.constrain_tree:
            #    alrt_args["--constraint_tree"] = self.constrain_tree

            alrt_job = Job(self.conf["app"]["phyml"],
                           alrt_args,
                           parent_ids=[tree_job.jobid])
            alrt_job.add_input_file(self.alg_phylip_file, alrt_job.jobdir)
            alrt_job.jobname += "-alrt"
            alrt_job.dependencies.add(tree_job)
            self.jobs.append(alrt_job)
            self.alrt_job = alrt_job

        else:
            # Bootstrap calculation
            boot_args = tree_job.args.copy()
            boot_args["-n"] = "bootstraps." + boot_args["-n"]
            boot_args["-N"] = int(self.bootstrap)
            boot_args["-b"] = 31416
            boot_job = Job(self.raxml_bin,
                           boot_args,
                           parent_ids=[tree_job.jobid])
            boot_job.jobname += "-%d-bootstraps" % (boot_args['-N'])
            boot_job.dependencies.add(tree_job)
            boot_job.cores = self.threads

            # Register necessary input files
            boot_job.add_input_file(self.alg_phylip_file)
            if self.constrain_tree:
                boot_job.add_input_file(self.constrain_tree)
            if self.partitions_file:
                boot_job.add_input_file(self.partitions_file)

            self.jobs.append(boot_job)

            # Bootstrap drawing on top of best tree
            bootd_args = tree_job.args.copy()
            if self.constrain_tree:
                del bootd_args["-g"]
            if self.partitions_file:
                del bootd_args["-q"]

            bootd_args["-n"] = "bootstrapped." + tree_job.args["-n"]
            bootd_args["-f"] = "b"
            bootd_args["-t"] = self.out_tree_file
            bootd_args["-z"] = pjoin(boot_job.jobdir,
                                     "RAxML_bootstrap." + boot_job.args["-n"])

            bootd_job = Job(self.raxml_bin,
                            bootd_args,
                            parent_ids=[tree_job.jobid])
            bootd_job.jobname += "-bootstrapped"
            bootd_job.dependencies.add(boot_job)
            bootd_job.cores = self.threads
            self.jobs.append(bootd_job)

            self.boot_job = boot_job
            self.bootd_job = bootd_job
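
Put together, the bootstrap branch above chains three RAxML runs whose outputs feed each other through RAxML's standard file naming. A compact sketch of the equivalent command lines (flag usage mirrors the args built above; raxml_bin, model, and the replicate count are placeholders):

def raxml_bootstrap_cmds(raxml_bin, alg, model, n_boots, seed=31416):
    best = [raxml_bin, "-s", alg, "-m", model, "-n", alg]          # best ML tree
    boots = [raxml_bin, "-s", alg, "-m", model,
             "-n", "bootstraps." + alg,
             "-N", str(n_boots), "-b", str(seed)]                  # replicates
    draw = [raxml_bin, "-f", "b",                                  # draw support
            "-t", "RAxML_bestTree." + alg,
            "-z", "RAxML_bootstrap.bootstraps." + alg,
            "-n", "bootstrapped." + alg]
    return best, boots, draw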
Example #34
def process_task(task, wkname, npr_conf, nodeid2info):
    alignerconf, alignerclass = npr_conf.aligner
    cleanerconf, cleanerclass = npr_conf.alg_cleaner
    mtesterconf, mtesterclass = npr_conf.model_tester
    treebuilderconf, treebuilderclass = npr_conf.tree_builder

    splitterconf, splitterclass = npr_conf.tree_splitter

    conf = GLOBALS[task.configid]
    seqtype = task.seqtype
    nodeid = task.nodeid
    ttype = task.ttype
    taskid = task.taskid
    threadid = task.threadid
    node_info = nodeid2info[nodeid]
    size = task.size  #node_info.get("size", 0)
    target_seqs = node_info.get("target_seqs", [])
    out_seqs = node_info.get("out_seqs", [])

    if not treebuilderclass or size < 4:
        # Allows dumping algs in workflows with no tree tasks, or skips tree
        # inference when it does not make sense given the number of sequences.
        # DummyTree produces a fake, fully collapsed newick tree.
        treebuilderclass = DummyTree

    # If more than one outgroup is used, enable the use of a constraint tree
    if out_seqs and len(out_seqs) > 1:
        constrain_id = nodeid
    else:
        constrain_id = None

    new_tasks = []
    if ttype == "msf":
        # Register tree constraints
        constrain_tree = "(%s, (%s));" % (','.join(sorted(
            task.out_seqs)), ','.join(sorted(task.target_seqs)))
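        # e.g. out_seqs={"o1","o2"}, target_seqs={"t1","t2"} yields the
        # multifurcating constraint "(o1,o2, (t1,t2));"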
        _outs = "\n".join([">%s\n0" % name for name in sorted(task.out_seqs)])
        _tars = "\n".join(
            [">%s\n1" % name for name in sorted(task.target_seqs)])
        constrain_alg = '\n'.join([_outs, _tars])
        db.add_task_data(nodeid, DATATYPES.constrain_tree, constrain_tree)
        db.add_task_data(nodeid, DATATYPES.constrain_alg, constrain_alg)
        # The creation of some Task objects may require this info,
        # so commit right now.
        db.dataconn.commit()

        # Register node
        db.add_node(task.threadid, task.nodeid, task.cladeid, task.target_seqs,
                    task.out_seqs)

        nodeid2info[nodeid]["size"] = task.size
        nodeid2info[nodeid]["target_seqs"] = task.target_seqs
        nodeid2info[nodeid]["out_seqs"] = task.out_seqs
        alg_task = alignerclass(nodeid, task.multiseq_file, seqtype, conf,
                                alignerconf)
        alg_task.size = task.size
        new_tasks.append(alg_task)

    elif ttype == "alg" or ttype == "acleaner":
        if ttype == "alg":
            nodeid2info[nodeid]["alg_path"] = task.alg_fasta_file
        elif ttype == "acleaner":
            nodeid2info[nodeid]["alg_clean_path"] = task.clean_alg_fasta_file

        alg_fasta_file = getattr(task, "clean_alg_fasta_file",
                                 task.alg_fasta_file)
        alg_phylip_file = getattr(task, "clean_alg_phylip_file",
                                  task.alg_phylip_file)

        # Calculate alignment stats
        # cons_mean, cons_std = get_trimal_conservation(task.alg_fasta_file,
        #                                        conf["app"]["trimal"])
        #
        # max_identity = get_trimal_identity(task.alg_fasta_file,
        #                                 conf["app"]["trimal"])
        # log.info("Conservation: %0.2f +-%0.2f", cons_mean, cons_std)
        # log.info("Max. Identity: %0.2f", max_identity)
        #import time
        #t1 = time.time()
        #mx, mn, mean, std = get_identity(task.alg_fasta_file)
        #print time.time()-t1
        #log.log(26, "Identity: max=%0.2f min=%0.2f mean=%0.2f +- %0.2f",
        #        mx, mn, mean, std)
        #t1 = time.time()

        if seqtype == "aa" and npr_conf.switch_aa_similarity < 1:
            try:
                alg_stats = db.get_task_data(taskid, DATATYPES.alg_stats)
            except Exception as e:
                alg_stats = {}

            if ttype == "alg":
                algfile = pjoin(GLOBALS["input_dir"], task.alg_phylip_file)
                dataid = DATATYPES.alg_phylip
            elif ttype == "acleaner":
                algfile = pjoin(GLOBALS["input_dir"],
                                task.clean_alg_phylip_file)
                dataid = DATATYPES.clean_alg_phylip

            if "i_mean" not in alg_stats:
                log.log(24, "Calculating alignment stats...")
                # dump data if necesary
                algfile = pjoin(GLOBALS["input_dir"], task.alg_phylip_file)
                if not pexist(algfile):
                    # dump phylip alg
                    open(algfile,
                         "w").write(db.get_data(db.get_dataid(taskid, dataid)))

                mx, mn, mean, std = get_statal_identity(
                    algfile, conf["app"]["statal"])
                alg_stats = {
                    "i_max": mx,
                    "i_mean": mean,
                    "i_min": mn,
                    "i_std": std
                }
                db.add_task_data(taskid, DATATYPES.alg_stats, alg_stats)

            log.log(22, "Alignment stats (sequence similarity):")
            log.log(
                22,
                "   max: %(i_max)0.2f, min:%(i_min)0.2f, avg:%(i_mean)0.2f+-%(i_std)0.2f"
                % (alg_stats))

        else:
            alg_stats = {"i_max": -1, "i_mean": -1, "i_min": -1, "i_std": -1}

        #print time.time()-t1
        #log.log(24, "Identity: max=%0.2f min=%0.2f mean=%0.2f +- %0.2f",
        #        mx, mn, mean, std)
        task.max_ident = alg_stats["i_max"]
        task.min_ident = alg_stats["i_min"]
        task.mean_ident = alg_stats["i_mean"]
        task.std_ident = alg_stats["i_std"]
        next_task = None

        if ttype == "alg" and cleanerclass:
            next_task = cleanerclass(nodeid, seqtype, alg_fasta_file,
                                     alg_phylip_file, conf, cleanerconf)
        else:
            # Converts aa alignment into nt if necessary
            if seqtype == "aa" and \
                    "nt" in GLOBALS["seqtypes"] and \
                    task.mean_ident >= npr_conf.switch_aa_similarity:
                log.log(28, "@@2:Switching to codon alignment!@@1: amino-acid sequence similarity: %0.2f >= %0.2f" %\
                        (task.mean_ident, npr_conf.switch_aa_similarity))
                alg_fasta_file = "%s.%s" % (taskid, DATATYPES.alg_nt_fasta)
                alg_phylip_file = "%s.%s" % (taskid, DATATYPES.alg_nt_phylip)
                try:
                    alg_fasta_file = db.get_dataid(taskid,
                                                   DATATYPES.alg_nt_fasta)
                    alg_phylip_file = db.get_dataid(taskid,
                                                    DATATYPES.alg_nt_phylip)
                except ValueError:
                    log.log(22, "Calculating codon alignment...")

                    source_alg = pjoin(GLOBALS["input_dir"],
                                       task.alg_fasta_file)
                    if ttype == "alg":
                        kept_columns = []
                    elif ttype == "acleaner":
                        # if original alignment was trimmed, use it as reference
                        # but make the nt alignment only on the kept columns
                        kept_columns = db.get_task_data(
                            taskid, DATATYPES.kept_alg_columns)

                    if not pexist(source_alg):
                        open(source_alg, "w").write(
                            db.get_task_data(taskid, DATATYPES.alg_fasta))

                    nt_alg = switch_to_codon(source_alg,
                                             kept_columns=kept_columns)
                    db.add_task_data(taskid, DATATYPES.alg_nt_fasta,
                                     nt_alg.write())
                    db.add_task_data(taskid, DATATYPES.alg_nt_phylip,
                                     nt_alg.write(format='iphylip_relaxed'))

                npr_conf = IterConfig(conf, wkname, task.size, "nt")
                seqtype = "nt"

            if mtesterclass:
                next_task = mtesterclass(nodeid, alg_fasta_file,
                                         alg_phylip_file, constrain_id, conf,
                                         mtesterconf)
            elif treebuilderclass:
                next_task = treebuilderclass(nodeid, alg_phylip_file,
                                             constrain_id, None, seqtype, conf,
                                             treebuilderconf)
        if next_task:
            next_task.size = task.size
            new_tasks.append(next_task)

    elif ttype == "mchooser":
        if treebuilderclass:
            alg_fasta_file = task.alg_fasta_file
            alg_phylip_file = task.alg_phylip_file
            model = task.best_model
            tree_task = treebuilderclass(nodeid, alg_phylip_file, constrain_id,
                                         model, seqtype, conf, treebuilderconf)
            tree_task.size = task.size
            new_tasks.append(tree_task)

    elif ttype == "tree":
        treemerge_task = splitterclass(nodeid, seqtype, task.tree_file, conf,
                                       splitterconf)
        #if conf["tree_splitter"]["_outgroup_size"]:
        #    treemerge_task = TreeSplitterWithOutgroups(nodeid, seqtype, task.tree_file, main_tree, conf)
        #else:
        #    treemerge_task = TreeSplitter(nodeid, seqtype, task.tree_file, main_tree, conf)

        treemerge_task.size = task.size
        new_tasks.append(treemerge_task)

    elif ttype == "treemerger":
        if not task.task_tree:
            task.finish()

        log.log(24, "Saving task tree...")
        annotate_node(task.task_tree, task)
        db.update_node(nid=task.nodeid,
                       runid=task.threadid,
                       newick=db.encode(task.task_tree))
        db.commit()

        # treebuilderclass is a class, not an instance, so compare identities
        if treebuilderclass is not DummyTree and npr_conf.max_iters > 1:
            current_iter = get_iternumber(threadid)
            if npr_conf.max_iters and current_iter >= npr_conf.max_iters:
                log.warning("Maximum number of iterations reached!")
            else:
                # Add new nodes
                source_seqtype = "aa" if "aa" in GLOBALS["seqtypes"] else "nt"
                ttree, mtree = task.task_tree, task.main_tree
                log.log(26, "Processing tree: %s seqs, %s outgroups",
                        len(target_seqs), len(out_seqs))
                alg_path = node_info.get("clean_alg_path",
                                         node_info["alg_path"])
                for node, seqs, outs, wkname in get_next_npr_node(
                        threadid, ttree, task.out_seqs, mtree, alg_path,
                        npr_conf):
                    log.log(24, "Registering new node: %s seqs, %s outgroups",
                            len(seqs), len(outs))
                    new_task_node = Msf(seqs, outs, seqtype=source_seqtype)
                    new_task_node.target_wkname = wkname
                    new_tasks.append(new_task_node)
    return new_tasks
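
Taken together, process_task advances every node through a fixed pipeline of task types, spawning the next stage as each one finishes. An illustrative summary of the transitions encoded above (optional stages are skipped when no cleaner or model tester is configured):

TTYPE_FLOW = {
    "msf":        "alg",         # align the multi-sequence file
    "alg":        "acleaner",    # or straight to mchooser/tree if no cleaner
    "acleaner":   "mchooser",    # or straight to tree if no model tester
    "mchooser":   "tree",        # build the tree with the best-fitting model
    "tree":       "treemerger",  # split the tree and merge into the main tree
    "treemerger": "msf",         # new NPR nodes restart the cycle
}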