def control_c(_signal, _frame):
    """SIGINT handler that pauses execution and shows an interactive menu.

    While the menu is displayed SIGINT is ignored and the database is
    committed.  The user can then:

    * ``q`` -- quit (raises ``KeyboardInterrupt``),
    * ``v`` -- pick a new verbosity level for ``log``,
    * ``d`` -- enter debug mode (``debug`` is installed as the SIGALRM
      handler and an alarm is scheduled one second later),
    * ``c`` -- continue execution.

    Except for ``q`` and ``d``, this handler is re-installed for SIGINT
    before returning.

    Args:
        _signal: signal number (unused).
        _frame: interrupted stack frame (unused).

    Raises:
        KeyboardInterrupt: when the user chooses to quit.
    """
    # Ignore further Ctrl+C presses while the menu is being handled.
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    db.commit()

    # Logging level -> verbosity index shown to (and typed by) the user.
    ver = {28: "0", 26: "1", 24: "2", 22: "3", 20: "4", 10: "5"}
    ver_level = log.level

    print('\n\nYou pressed Ctrl+C!')
    print('q) quit')
    print('v) change verbosity level:', ver.get(ver_level, ver_level))
    print('d) enter debug mode')
    print('c) continue execution')
    key = ask(" Choose:", ["q", "v", "d", "c"])
    if key == "q":
        raise KeyboardInterrupt
    elif key == "d":
        # NOTE(review): this path returns without re-installing the SIGINT
        # handler, so Ctrl+C stays ignored afterwards -- confirm that the
        # `debug` handler is expected to take care of it.
        signal.signal(signal.SIGALRM, debug)
        signal.alarm(1)
        return
    elif key == "v":
        vl = ask("new level", sorted(ver.values()))
        # Verbosity index N selects the N-th highest logging level.
        new_level = sorted(ver, reverse=True)[int(vl)]
        log.setLevel(new_level)
    # A second `elif key == "d"` branch (dropping into pdb) used to live here;
    # it could never run because "d" is already handled above.
    signal.signal(signal.SIGINT, control_c)
def process_task(task, wkname, npr_conf, nodeid2info):
    """Build the follow-up tasks for a finished species-tree workflow task.

    The next step depends on ``task.ttype``:

    * ``"cog_selector"`` -> registers a concatenated-alignment task plus its
      tree constraints,
    * ``"concat_alg"``   -> creates the tree-building task,
    * ``"tree"``         -> creates the tree splitter/merger task,
    * ``"treemerger"``   -> stores the resulting tree and, if more NPR
      iterations are allowed, registers the next cog-selector tasks.

    Args:
        task: the finished task being processed.
        wkname: name of the workflow configuration block for this task.
        npr_conf: iteration config exposing ``cog_selector``,
            ``alg_concatenator``, ``tree_builder`` and ``tree_splitter`` as
            ``(config, class)`` pairs, plus ``max_iters``.
        nodeid2info: mapping of node id -> dict with per-node information;
            updated in place for ``"concat_alg"`` tasks.

    Returns:
        list: newly created tasks (possibly empty).
    """
    cogconf, cogclass = npr_conf.cog_selector
    concatconf, concatclass = npr_conf.alg_concatenator
    treebuilderconf, treebuilderclass = npr_conf.tree_builder
    splitterconf, splitterclass = npr_conf.tree_splitter

    threadid, nodeid, seqtype, ttype = (task.threadid, task.nodeid,
                                        task.seqtype, task.ttype)
    cladeid, targets, outgroups = db.get_node_info(threadid, nodeid)

    if not treebuilderclass or task.size < 4:
        # Allows to dump algs in workflows with no tree tasks or if tree
        # inference does not make sense given the number of sequences.
        # DummyTree will produce a fake fully collapsed newick tree.
        treebuilderclass = DummyTree

    # A topology constraint is only used with more than one outgroup.
    if outgroups and len(outgroups) > 1:
        constrain_id = nodeid
    else:
        constrain_id = None

    # The value is not used below; the lookup is kept so that an unknown node
    # id is handled exactly as before (fails here, or a defaulting mapping
    # creates the entry).
    node_info = nodeid2info[nodeid]
    conf = GLOBALS[task.configid]
    new_tasks = []

    if ttype == "cog_selector":
        # Generates a md5 id based on the genetree configuration workflow
        # used for the concat alg task. If something changes, concat alg will
        # change and the associated tree will be rebuilt.
        config_blocks = set([wkname])
        for value in six.itervalues(conf[wkname]):
            if isinstance(value, (list, tuple, set)):
                for elem in value:
                    if isinstance(elem, str) and elem.startswith("@"):
                        config_blocks.add(elem[1:])
            elif isinstance(value, str) and value.startswith("@"):
                config_blocks.add(value[1:])
        config_checksum = md5(''.join(
            ["[%s]\n%s" % (x, dict_string(conf[x]))
             for x in sorted(config_blocks)]))

        # The check that the current COG selection covers all target and
        # outgroup species has been moved to the cog_selector task.

        # Register concat alignment task. NodeId associated to concat_alg
        # tasks and all its children jobs should take into account cog
        # information and not only species and outgroups included.
        concat_job = concatclass(task.cogs, seqtype, conf, concatconf,
                                 config_checksum)
        db.add_node(threadid, concat_job.nodeid, cladeid, targets, outgroups)

        # Register tree constraints
        constrain_tree = "(%s, (%s));" % (','.join(sorted(outgroups)),
                                          ','.join(sorted(targets)))
        _outs = "\n".join([">%s\n0" % name for name in sorted(outgroups)])
        _tars = "\n".join([">%s\n1" % name for name in sorted(targets)])
        constrain_alg = '\n'.join([_outs, _tars])
        db.add_task_data(concat_job.nodeid, DATATYPES.constrain_tree,
                         constrain_tree)
        db.add_task_data(concat_job.nodeid, DATATYPES.constrain_alg,
                         constrain_alg)
        # Since the creation of some Task objects may require this info, it
        # has to be committed right now.
        db.dataconn.commit()

        concat_job.size = task.size
        new_tasks.append(concat_job)

    elif ttype == "concat_alg":
        # Register tree for concat alignment, using constraint tree if
        # necessary
        alg_id = db.get_dataid(task.taskid, DATATYPES.concat_alg_phylip)
        try:
            parts_id = db.get_dataid(task.taskid, DATATYPES.model_partitions)
        except ValueError:
            parts_id = None

        nodeid2info[nodeid]["size"] = task.size
        nodeid2info[nodeid]["target_seqs"] = targets
        nodeid2info[nodeid]["out_seqs"] = outgroups

        tree_task = treebuilderclass(nodeid, alg_id, constrain_id, None,
                                     seqtype, conf, treebuilderconf,
                                     parts_id=parts_id)
        tree_task.size = task.size
        new_tasks.append(tree_task)

    elif ttype == "tree":
        merger_task = splitterclass(nodeid, seqtype, task.tree_file, conf,
                                    splitterconf)
        merger_task.size = task.size
        new_tasks.append(merger_task)

    elif ttype == "treemerger":
        # Lets merge with main tree
        if not task.task_tree:
            task.finish()

        log.log(24, "Saving task tree...")
        annotate_node(task.task_tree, task)
        db.update_node(nid=task.nodeid, runid=task.threadid,
                       newick=db.encode(task.task_tree))
        db.commit()

        # `treebuilderclass` holds a class, not an instance, so the former
        # isinstance(treebuilderclass, DummyTree) test was always False and
        # dummy trees were never excluded from further iterations.
        if treebuilderclass is not DummyTree and npr_conf.max_iters > 1:
            current_iter = get_iternumber(threadid)
            if npr_conf.max_iters and current_iter >= npr_conf.max_iters:
                log.warning("Maximum number of iterations reached!")
            else:
                # Add new nodes
                source_seqtype = "aa" if "aa" in GLOBALS["seqtypes"] else "nt"
                ttree, mtree = task.task_tree, task.main_tree

                log.log(26, "Processing tree: %s seqs, %s outgroups",
                        len(targets), len(outgroups))

                target_cladeids = None
                if tobool(conf[splitterconf].get("_find_ncbi_targets", False)):
                    tcopy = mtree.copy()
                    ncbi.connect_database()
                    tax2name, tax2track = ncbi.annotate_tree_with_taxa(
                        tcopy, None)
                    n2content = tcopy.get_cached_content()
                    (broken_branches, broken_clades, broken_clade_sizes,
                     tax2name) = ncbi.get_broken_branches(tcopy, n2content)
                    log.log(28, 'restricting NPR to broken clades: ' +
                            colorify(', '.join(
                                ["%s" % tax2name[x] for x in broken_clades]),
                                "wr"))
                    target_cladeids = set()
                    for branch in broken_branches:
                        print(branch.get_ascii(
                            attributes=['spname', 'taxid'], compact=True))
                        print(["%s" % tax2name[x]
                               for x in broken_branches[branch]])
                        target_cladeids.add(branch.cladeid)

                # The None argument is to avoid alg checks.
                for _node, seqs, outs, next_wkname in get_next_npr_node(
                        task.configid, ttree, task.out_seqs, mtree, None,
                        npr_conf, target_cladeids):
                    log.log(24, "Adding new node: %s seqs, %s outgroups",
                            len(seqs), len(outs))
                    new_task_node = cogclass(seqs, outs, source_seqtype,
                                             conf, cogconf)
                    new_task_node.target_wkname = next_wkname
                    new_tasks.append(new_task_node)
                    db.add_node(threadid, new_task_node.nodeid,
                                new_task_node.cladeid,
                                new_task_node.targets,
                                new_task_node.outgroups)
    return new_tasks
def schedule(workflow_task_processor, pending_tasks, schedule_time, execution,
             debug, norender):
    """Run the task scheduling loop until no pending task is left.

    On every cycle each pending task gets its status refreshed, waiting jobs
    are pushed to the background launcher queue (if one was started), finished
    tasks ("D") are handed to ``workflow_task_processor`` to obtain follow-up
    tasks, and failed tasks are recorded.  When all tasks of a thread are
    done, its final tree (and root alignments, if available) is written.

    Args:
        workflow_task_processor: callable invoked as
            ``f(task, task.target_wkname)`` for every task with status "D";
            must return an iterable of new tasks.
        pending_tasks: iterable with the initial tasks.
        schedule_time: seconds to sleep between scheduling cycles.
        execution: two-item sequence ``(execution_mode, run_detached)``.
        debug: ``"all"`` sets log level 10 from the start; any other truthy
            value is treated as a task id prefix that switches log level to
            10 once a matching task is checked.
        norender: a tree image is generated only when this equals ``False``.

    Returns:
        defaultdict: configid -> list of ``[task, error_value, message]``
        entries describing the errors found in each thread.
    """
    # Local import: the file-level import block is not visible from here.
    from functools import cmp_to_key

    # Adjust debug mode
    if debug == "all":
        log.setLevel(10)
    pending_tasks = set(pending_tasks)

    ## ===================================
    ## INITIALIZE BASIC VARS
    execution, run_detached = execution
    thread2tasks = defaultdict(list)
    for task in pending_tasks:
        thread2tasks[task.configid].append(task)
    expected_threads = set(thread2tasks.keys())
    past_threads = {}
    thread_errors = defaultdict(list)
    ## END OF VARS AND SHORTCUTS
    ## ===================================

    cores_total = GLOBALS["_max_cores"]
    if cores_total > 0:
        job_queue = Queue()
        back_launcher = Process(target=background_job_launcher,
                                args=(job_queue, run_detached,
                                      GLOBALS["launch_time"], cores_total))
        back_launcher.start()
    else:
        job_queue = None
        back_launcher = None

    GLOBALS["_background_scheduler"] = back_launcher
    GLOBALS["_job_queue"] = job_queue

    last_report_time = None
    # Ids of the jobs that were already prepared and queued.
    queued_jobids = set()

    # Enters into task scheduling
    while pending_tasks:
        wtime = schedule_time

        # SGE job querying is currently disabled, so no qstat information is
        # available in any execution mode.  (Previously the name was left
        # unbound when execution == "sge".)
        qstat_jobs = None

        # Show summary of pending tasks per thread
        thread2tasks = defaultdict(list)
        for task in pending_tasks:
            thread2tasks[task.configid].append(task)
        set_logindent(0)
        log.log(28, "@@13: Updating tasks status:@@1: (%s)" % (ctime()))
        info_lines = []
        for tid, tlist in six.iteritems(thread2tasks):
            threadname = GLOBALS[tid]["_name"]
            sizelist = ["%s" % getattr(_ts, "size", "?") for _ts in tlist]
            info = "Thread @@13:%s@@1:: pending tasks: @@8:%s@@1: of sizes: %s" % (
                threadname, len(tlist), ', '.join(sizelist))
            info_lines.append(info)

        for line in info_lines:
            log.log(28, line)

        if GLOBALS["email"] and last_report_time is None:
            last_report_time = time()
            send_mail(GLOBALS["email"], "Your NPR process has started",
                      '\n'.join(info_lines))

        ## ================================
        ## CHECK AND UPDATE CURRENT TASKS
        checked_tasks = set()
        to_add_tasks = set()

        GLOBALS["cached_status"] = {}
        # `sort_tasks` is a cmp-style function; a positional cmp argument is
        # rejected by Python 3's sorted(), so it is wrapped as a key.
        for task in sorted(pending_tasks, key=cmp_to_key(sort_tasks)):
            # Enter debugging mode if necessary
            if debug and log.level > 10 and task.taskid.startswith(debug):
                log.setLevel(10)
                log.debug("ENTERING IN DEBUGGING MODE")
            thread2tasks[task.configid].append(task)

            # Update tasks and job statuses
            if task.taskid not in checked_tasks:
                try:
                    show_task_info(task)
                    task.status = task.get_status(qstat_jobs)
                    db.dataconn.commit()
                    if back_launcher and task.status not in ("D", "E"):
                        for j, cmd in task.iter_waiting_jobs():
                            j.status = "Q"
                            GLOBALS["cached_status"][j.jobid] = "Q"
                            if j.jobid in queued_jobids:
                                continue

                            if not os.path.exists(j.jobdir):
                                os.makedirs(j.jobdir)
                            for ifile, outpath in six.iteritems(j.input_files):
                                # Input names look like "<taskid>.<datatype>";
                                # anything else is used directly as data id.
                                try:
                                    _tid, _did = ifile.split(".")
                                    _did = int(_did)
                                except (IndexError, ValueError):
                                    dataid = ifile
                                else:
                                    dataid = db.get_dataid(_tid, _did)

                                if not outpath:
                                    outfile = pjoin(GLOBALS["input_dir"], ifile)
                                else:
                                    outfile = pjoin(outpath, ifile)

                                if not os.path.exists(outfile):
                                    with open(outfile, "w") as out_handle:
                                        out_handle.write(db.get_data(dataid))

                            log.log(24, " @@8:Queueing @@1: %s from %s" % (j, task))
                            if execution:
                                job_queue.put([j.jobid, j.cores, cmd,
                                               j.status_file])
                            queued_jobids.add(j.jobid)

                    update_task_states_recursively(task)
                    db.commit()
                    checked_tasks.add(task.taskid)
                except TaskError as e:
                    log.error("Errors found in %s" % task)
                    import traceback
                    traceback.print_exc()
                    if GLOBALS["email"]:
                        threadname = GLOBALS[task.configid]["_name"]
                        send_mail(GLOBALS["email"],
                                  "Errors found in %s!" % threadname,
                                  '\n'.join(map(str, [task, e.value, e.msg])))
                    pending_tasks.discard(task)
                    thread_errors[task.configid].append(
                        [task, e.value, e.msg])
                    continue
            else:
                # Set temporary Queued state to avoid launching jobs from
                # clones
                task.status = "Q"
                if log.level < 24:
                    show_task_info(task)

            if task.status == "D":
                show_task_info(task)
                logindent(3)

                # Log commands of every task
                if 'cmd_log_file' not in GLOBALS[task.configid]:
                    GLOBALS[task.configid]['cmd_log_file'] = pjoin(
                        GLOBALS[task.configid]["_outpath"], "cmd.log")
                    # Create/truncate the log file the first time it is used.
                    with open(GLOBALS[task.configid]['cmd_log_file'], "w"):
                        pass

                cmd_lines = get_cmd_log(task)
                with open(GLOBALS[task.configid]['cmd_log_file'], "a") as cmd_log:
                    print(task, file=cmd_log)
                    for c in cmd_lines:
                        print(' ' + '\t'.join(map(str, c)), file=cmd_log)

                try:
                    create_tasks = workflow_task_processor(
                        task, task.target_wkname)
                except TaskError as e:
                    log.error("Errors found in %s" % task)
                    pending_tasks.discard(task)
                    thread_errors[task.configid].append(
                        [task, e.value, e.msg])
                    # Undo the indentation added above so it does not leak
                    # into the remaining tasks of this cycle.
                    logindent(-3)
                    continue
                else:
                    logindent(-3)
                    to_add_tasks.update(create_tasks)
                    pending_tasks.discard(task)

            elif task.status == "E":
                log.error("task contains errors: %s " % task)
                log.error("Errors found in %s", task)
                pending_tasks.discard(task)
                thread_errors[task.configid].append(
                    [task, None, "Found (E) task status"])

        # Update global task list with recently added jobs to be checked
        # during next cycle
        pending_tasks.update(to_add_tasks)

        ## END CHECK AND UPDATE CURRENT TASKS
        ## ================================

        if wtime:
            set_logindent(0)
            log.log(28, "@@13:Waiting %s seconds@@1:" % wtime)
            sleep(wtime)
        else:
            sleep(schedule_time)

        # Dump / show ended threads
        error_lines = []
        for configid, etasks in six.iteritems(thread_errors):
            error_lines.append("Thread @@10:%s@@1: contains errors:" %
                               (GLOBALS[configid]["_name"]))
            for error in etasks:
                error_lines.append(" ** %s" % error[0])
                e_obj = error[1] if error[1] else error[0]
                error_path = e_obj.jobdir if isjob(e_obj) else e_obj.taskid
                if e_obj is not error[0]:
                    error_lines.append(" -> %s" % e_obj)
                error_lines.append(" -> %s" % error_path)
                error_lines.append(" -> %s" % error[2])
        for eline in error_lines:
            log.error(eline)

        pending_threads = set([ts.configid for ts in pending_tasks])
        finished_threads = expected_threads - (
            pending_threads | set(thread_errors.keys()))
        just_finished_lines = []
        finished_lines = []
        for configid in finished_threads:
            # configid is the same as threadid in master tasks
            final_tree_file = pjoin(GLOBALS[configid]["_outpath"],
                                    GLOBALS["inputname"] + ".final_tree")
            threadname = GLOBALS[configid]["_name"]

            if configid in past_threads:
                log.log(28, "Done thread @@12:%s@@1: in %d iteration(s)",
                        threadname, past_threads[configid])
                finished_lines.append("Finished %s in %d iteration(s)" % (
                    threadname, past_threads[configid]))
                continue

            log.log(28, "Assembling final tree...")
            main_tree, treeiters = assembly_tree(configid)
            past_threads[configid] = treeiters - 1

            log.log(28, "Done thread @@12:%s@@1: in %d iteration(s)",
                    threadname, past_threads[configid])

            log.log(28, "Writing final tree for @@13:%s@@1:\n %s\n %s",
                    threadname, final_tree_file + ".nw",
                    final_tree_file + ".nwx (newick extended)")
            main_tree.write(outfile=final_tree_file + ".nw")
            main_tree.write(outfile=final_tree_file + ".nwx", features=[],
                            format_root_node=True)

            # Last alignment loaded for this thread (trimmed one wins); stays
            # None when the tree carries no alignment, instead of being
            # unbound or inherited from a previous thread.
            alg = None
            if hasattr(main_tree, "alg_path"):
                log.log(28, "Writing root node alignment @@13:%s@@1:\n %s",
                        threadname, final_tree_file + ".fa")
                alg = SeqGroup(get_stored_data(main_tree.alg_path))
                with open(final_tree_file + ".fa", "w") as out_handle:
                    for name, seq, _comments in alg:
                        realname = db.get_seq_name(name)
                        print(">%s\n%s" % (realname, seq), file=out_handle)

            if hasattr(main_tree, "clean_alg_path"):
                log.log(28, "Writing root node trimmed alignment @@13:%s@@1:\n %s",
                        threadname, final_tree_file + ".trimmed.fa")
                alg = SeqGroup(get_stored_data(main_tree.clean_alg_path))
                with open(final_tree_file + ".trimmed.fa", "w") as out_handle:
                    for name, seq, _comments in alg:
                        realname = db.get_seq_name(name)
                        print(">%s\n%s" % (realname, seq), file=out_handle)

            # Kept as an equality test on purpose: None must not trigger
            # rendering.
            if norender == False:
                log.log(28, "Generating tree image for @@13:%s@@1:\n %s",
                        threadname, final_tree_file + ".png")
                if alg is not None:
                    for lf in main_tree:
                        lf.add_feature("sequence", alg.get_seq(lf.safename))
                try:
                    from ete3.tools.phylobuild_lib.visualize import draw_tree
                    draw_tree(main_tree, GLOBALS[configid],
                              final_tree_file + ".png")
                except Exception:
                    # Rendering is best-effort; the trees are already on disk.
                    log.warning(
                        '@@8:something went wrong when generating the tree image. Try manually :(@@1:'
                    )
                    if DEBUG:
                        import traceback, sys
                        traceback.print_exc(file=sys.stdout)

            just_finished_lines.append("Finished %s in %d iteration(s)" % (
                threadname, past_threads[configid]))

        if GLOBALS["email"]:
            if not pending_tasks:
                all_lines = finished_lines + just_finished_lines + error_lines
                send_mail(GLOBALS["email"], "Your NPR process has ended",
                          '\n'.join(all_lines))
            elif GLOBALS["email_report_time"] and \
                    time() - last_report_time >= GLOBALS["email_report_time"]:
                all_lines = info_lines + error_lines + just_finished_lines
                send_mail(GLOBALS["email"], "Your NPR report",
                          '\n'.join(all_lines))
                last_report_time = time()
            elif just_finished_lines:
                send_mail(GLOBALS["email"], "Finished threads!",
                          '\n'.join(just_finished_lines))

        log.log(26, "")

    if thread_errors:
        log.error("Done with ERRORS")
    else:
        log.log(28, "Done")

    return thread_errors
def process_task(task, wkname, npr_conf, nodeid2info):
    """Build the follow-up tasks for a finished species-tree workflow task.

    The next step depends on ``task.ttype``:

    * ``"cog_selector"`` -> registers a concatenated-alignment task plus its
      tree constraints,
    * ``"concat_alg"``   -> creates the tree-building task,
    * ``"tree"``         -> creates the tree splitter/merger task,
    * ``"treemerger"``   -> stores the resulting tree and, if more NPR
      iterations are allowed, registers the next cog-selector tasks.

    Args:
        task: the finished task being processed.
        wkname: name of the workflow configuration block for this task.
        npr_conf: iteration config exposing ``cog_selector``,
            ``alg_concatenator``, ``tree_builder`` and ``tree_splitter`` as
            ``(config, class)`` pairs, plus ``max_iters``.
        nodeid2info: mapping of node id -> dict with per-node information;
            updated in place for ``"concat_alg"`` tasks.

    Returns:
        list: newly created tasks (possibly empty).
    """
    cogconf, cogclass = npr_conf.cog_selector
    concatconf, concatclass = npr_conf.alg_concatenator
    treebuilderconf, treebuilderclass = npr_conf.tree_builder
    splitterconf, splitterclass = npr_conf.tree_splitter

    threadid, nodeid, seqtype, ttype = (task.threadid, task.nodeid,
                                        task.seqtype, task.ttype)
    cladeid, targets, outgroups = db.get_node_info(threadid, nodeid)

    if not treebuilderclass or task.size < 4:
        # Allows to dump algs in workflows with no tree tasks or if tree
        # inference does not make sense given the number of sequences.
        # DummyTree will produce a fake fully collapsed newick tree.
        treebuilderclass = DummyTree

    # A topology constraint is only used with more than one outgroup.
    if outgroups and len(outgroups) > 1:
        constrain_id = nodeid
    else:
        constrain_id = None

    # The value is not used below; the lookup is kept so that an unknown node
    # id is handled exactly as before (fails here, or a defaulting mapping
    # creates the entry).
    node_info = nodeid2info[nodeid]
    conf = GLOBALS[task.configid]
    new_tasks = []

    if ttype == "cog_selector":
        # Generates a md5 id based on the genetree configuration workflow
        # used for the concat alg task. If something changes, concat alg will
        # change and the associated tree will be rebuilt.
        config_blocks = set([wkname])
        for value in six.itervalues(conf[wkname]):
            if isinstance(value, (list, tuple, set)):
                for elem in value:
                    if isinstance(elem, str) and elem.startswith("@"):
                        config_blocks.add(elem[1:])
            elif isinstance(value, str) and value.startswith("@"):
                config_blocks.add(value[1:])
        config_checksum = md5(''.join(
            ["[%s]\n%s" % (x, dict_string(conf[x]))
             for x in sorted(config_blocks)]))

        # The check that the current COG selection covers all target and
        # outgroup species has been moved to the cog_selector task.

        # Register concat alignment task. NodeId associated to concat_alg
        # tasks and all its children jobs should take into account cog
        # information and not only species and outgroups included.
        concat_job = concatclass(task.cogs, seqtype, conf, concatconf,
                                 config_checksum)
        db.add_node(threadid, concat_job.nodeid, cladeid, targets, outgroups)

        # Register tree constraints
        constrain_tree = "(%s, (%s));" % (','.join(sorted(outgroups)),
                                          ','.join(sorted(targets)))
        _outs = "\n".join([">%s\n0" % name for name in sorted(outgroups)])
        _tars = "\n".join([">%s\n1" % name for name in sorted(targets)])
        constrain_alg = '\n'.join([_outs, _tars])
        db.add_task_data(concat_job.nodeid, DATATYPES.constrain_tree,
                         constrain_tree)
        db.add_task_data(concat_job.nodeid, DATATYPES.constrain_alg,
                         constrain_alg)
        # Since the creation of some Task objects may require this info, it
        # has to be committed right now.
        db.dataconn.commit()

        concat_job.size = task.size
        new_tasks.append(concat_job)

    elif ttype == "concat_alg":
        # Register tree for concat alignment, using constraint tree if
        # necessary
        alg_id = db.get_dataid(task.taskid, DATATYPES.concat_alg_phylip)
        try:
            parts_id = db.get_dataid(task.taskid, DATATYPES.model_partitions)
        except ValueError:
            parts_id = None

        nodeid2info[nodeid]["size"] = task.size
        nodeid2info[nodeid]["target_seqs"] = targets
        nodeid2info[nodeid]["out_seqs"] = outgroups

        tree_task = treebuilderclass(nodeid, alg_id, constrain_id, None,
                                     seqtype, conf, treebuilderconf,
                                     parts_id=parts_id)
        tree_task.size = task.size
        new_tasks.append(tree_task)

    elif ttype == "tree":
        merger_task = splitterclass(nodeid, seqtype, task.tree_file, conf,
                                    splitterconf)
        merger_task.size = task.size
        new_tasks.append(merger_task)

    elif ttype == "treemerger":
        # Lets merge with main tree
        if not task.task_tree:
            task.finish()

        log.log(24, "Saving task tree...")
        annotate_node(task.task_tree, task)
        db.update_node(nid=task.nodeid, runid=task.threadid,
                       newick=db.encode(task.task_tree))
        db.commit()

        # `treebuilderclass` holds a class, not an instance, so the former
        # isinstance(treebuilderclass, DummyTree) test was always False and
        # dummy trees were never excluded from further iterations.
        if treebuilderclass is not DummyTree and npr_conf.max_iters > 1:
            current_iter = get_iternumber(threadid)
            if npr_conf.max_iters and current_iter >= npr_conf.max_iters:
                log.warning("Maximum number of iterations reached!")
            else:
                # Add new nodes
                source_seqtype = "aa" if "aa" in GLOBALS["seqtypes"] else "nt"
                ttree, mtree = task.task_tree, task.main_tree

                log.log(26, "Processing tree: %s seqs, %s outgroups",
                        len(targets), len(outgroups))

                target_cladeids = None
                if tobool(conf[splitterconf].get("_find_ncbi_targets", False)):
                    tcopy = mtree.copy()
                    ncbi.connect_database()
                    tax2name, tax2track = ncbi.annotate_tree_with_taxa(
                        tcopy, None)
                    n2content = tcopy.get_cached_content()
                    (broken_branches, broken_clades, broken_clade_sizes,
                     tax2name) = ncbi.get_broken_branches(tcopy, n2content)
                    log.log(28, 'restricting NPR to broken clades: ' +
                            colorify(', '.join(
                                ["%s" % tax2name[x] for x in broken_clades]),
                                "wr"))
                    target_cladeids = set()
                    for branch in broken_branches:
                        print(branch.get_ascii(
                            attributes=['spname', 'taxid'], compact=True))
                        print(["%s" % tax2name[x]
                               for x in broken_branches[branch]])
                        target_cladeids.add(branch.cladeid)

                # The None argument is to avoid alg checks.
                for _node, seqs, outs, next_wkname in get_next_npr_node(
                        task.configid, ttree, task.out_seqs, mtree, None,
                        npr_conf, target_cladeids):
                    log.log(24, "Adding new node: %s seqs, %s outgroups",
                            len(seqs), len(outs))
                    new_task_node = cogclass(seqs, outs, source_seqtype,
                                             conf, cogconf)
                    new_task_node.target_wkname = next_wkname
                    new_tasks.append(new_task_node)
                    db.add_node(threadid, new_task_node.nodeid,
                                new_task_node.cladeid,
                                new_task_node.targets,
                                new_task_node.outgroups)
    return new_tasks
def process_task(task, wkname, npr_conf, nodeid2info):
    """Build the follow-up tasks for a finished gene-tree workflow task.

    The next step depends on ``task.ttype``:

    * ``"msf"``               -> registers the node and its tree constraints
      and creates the aligner task,
    * ``"alg"``/``"acleaner"`` -> optionally computes alignment identity
      stats, may switch from amino-acid to codon alignment, and creates the
      cleaner, model tester or tree builder task,
    * ``"mchooser"``          -> creates the tree builder task with the
      selected model,
    * ``"tree"``              -> creates the tree splitter/merger task,
    * ``"treemerger"``        -> stores the resulting tree and, if more NPR
      iterations are allowed, registers new ``Msf`` tasks.

    Args:
        task: the finished task being processed.
        wkname: name of the workflow configuration block for this task.
        npr_conf: iteration config exposing ``aligner``, ``alg_cleaner``,
            ``model_tester``, ``tree_builder`` and ``tree_splitter`` as
            ``(config, class)`` pairs, plus ``switch_aa_similarity`` and
            ``max_iters``.
        nodeid2info: mapping of node id -> dict with per-node information;
            updated in place.

    Returns:
        list: newly created tasks (possibly empty).
    """
    alignerconf, alignerclass = npr_conf.aligner
    cleanerconf, cleanerclass = npr_conf.alg_cleaner
    mtesterconf, mtesterclass = npr_conf.model_tester
    treebuilderconf, treebuilderclass = npr_conf.tree_builder
    splitterconf, splitterclass = npr_conf.tree_splitter

    conf = GLOBALS[task.configid]
    seqtype = task.seqtype
    nodeid = task.nodeid
    ttype = task.ttype
    taskid = task.taskid
    threadid = task.threadid
    node_info = nodeid2info[nodeid]
    size = task.size
    target_seqs = node_info.get("target_seqs", [])
    out_seqs = node_info.get("out_seqs", [])

    if not treebuilderclass or size < 4:
        # Allows to dump algs in workflows with no tree tasks or if tree
        # inference does not make sense given the number of sequences.
        # DummyTree will produce a fake fully collapsed newick tree.
        treebuilderclass = DummyTree

    # If more than one outgroup are used, enable the use of constrain
    if out_seqs and len(out_seqs) > 1:
        constrain_id = nodeid
    else:
        constrain_id = None

    new_tasks = []
    if ttype == "msf":
        # Register tree constraints
        constrain_tree = "(%s, (%s));" % (','.join(sorted(task.out_seqs)),
                                          ','.join(sorted(task.target_seqs)))
        _outs = "\n".join([">%s\n0" % name for name in sorted(task.out_seqs)])
        _tars = "\n".join(
            [">%s\n1" % name for name in sorted(task.target_seqs)])
        constrain_alg = '\n'.join([_outs, _tars])
        db.add_task_data(nodeid, DATATYPES.constrain_tree, constrain_tree)
        db.add_task_data(nodeid, DATATYPES.constrain_alg, constrain_alg)
        # Since the creation of some Task objects may require this info, it
        # has to be committed right now.
        db.dataconn.commit()

        # Register node
        db.add_node(task.threadid, task.nodeid, task.cladeid,
                    task.target_seqs, task.out_seqs)

        nodeid2info[nodeid]["size"] = task.size
        nodeid2info[nodeid]["target_seqs"] = task.target_seqs
        nodeid2info[nodeid]["out_seqs"] = task.out_seqs
        alg_task = alignerclass(nodeid, task.multiseq_file, seqtype, conf,
                                alignerconf)
        alg_task.size = task.size
        new_tasks.append(alg_task)

    elif ttype == "alg" or ttype == "acleaner":
        if ttype == "alg":
            nodeid2info[nodeid]["alg_path"] = task.alg_fasta_file
        else:
            # NOTE(review): stored as "alg_clean_path" but the treemerger
            # branch below reads "clean_alg_path" -- confirm which key is the
            # intended one.
            nodeid2info[nodeid]["alg_clean_path"] = task.clean_alg_fasta_file

        alg_fasta_file = getattr(task, "clean_alg_fasta_file",
                                 task.alg_fasta_file)
        alg_phylip_file = getattr(task, "clean_alg_phylip_file",
                                  task.alg_phylip_file)

        if seqtype == "aa" and npr_conf.switch_aa_similarity < 1:
            try:
                alg_stats = db.get_task_data(taskid, DATATYPES.alg_stats)
            except Exception:
                # No stored stats (or they cannot be read): recompute below.
                alg_stats = {}

            if ttype == "alg":
                dataid = DATATYPES.alg_phylip
            else:
                dataid = DATATYPES.clean_alg_phylip

            if "i_mean" not in alg_stats:
                log.log(24, "Calculating alignment stats...")
                # Dump data if necessary.
                # NOTE(review): for "acleaner" tasks the path still points to
                # task.alg_phylip_file while the dumped data is the cleaned
                # alignment -- behavior kept as is, confirm it is intended.
                algfile = pjoin(GLOBALS["input_dir"], task.alg_phylip_file)
                if not pexist(algfile):
                    # dump phylip alg
                    with open(algfile, "w") as alg_handle:
                        alg_handle.write(
                            db.get_data(db.get_dataid(taskid, dataid)))

                mx, mn, mean, std = get_statal_identity(
                    algfile, conf["app"]["statal"])
                alg_stats = {"i_max": mx,
                             "i_mean": mean,
                             "i_min": mn,
                             "i_std": std}
                db.add_task_data(taskid, DATATYPES.alg_stats, alg_stats)

            log.log(22, "Alignment stats (sequence similarity):")
            log.log(22,
                    " max: %(i_max)0.2f, min:%(i_min)0.2f, avg:%(i_mean)0.2f+-%(i_std)0.2f"
                    % (alg_stats))
        else:
            alg_stats = {"i_max": -1, "i_mean": -1, "i_min": -1, "i_std": -1}

        task.max_ident = alg_stats["i_max"]
        task.min_ident = alg_stats["i_min"]
        task.mean_ident = alg_stats["i_mean"]
        task.std_ident = alg_stats["i_std"]

        next_task = None
        if ttype == "alg" and cleanerclass:
            next_task = cleanerclass(nodeid, seqtype, alg_fasta_file,
                                     alg_phylip_file, conf, cleanerconf)
        else:
            # Converts aa alignment into nt if necessary
            if seqtype == "aa" and \
               "nt" in GLOBALS["seqtypes"] and \
               task.mean_ident >= npr_conf.switch_aa_similarity:
                log.log(28, "@@2:Switching to codon alignment!@@1: amino-acid sequence similarity: %0.2f >= %0.2f" %
                        (task.mean_ident, npr_conf.switch_aa_similarity))
                alg_fasta_file = "%s.%s" % (taskid, DATATYPES.alg_nt_fasta)
                alg_phylip_file = "%s.%s" % (taskid, DATATYPES.alg_nt_phylip)
                try:
                    # Existence probes only.  The results used to be assigned
                    # (both of them) to alg_fasta_file, which left the fasta
                    # name pointing at the phylip data; discarding them makes
                    # the cached path hand the same identifiers to the next
                    # task as the freshly computed path.
                    db.get_dataid(taskid, DATATYPES.alg_nt_fasta)
                    db.get_dataid(taskid, DATATYPES.alg_nt_phylip)
                except ValueError:
                    log.log(22, "Calculating codon alignment...")
                    source_alg = pjoin(GLOBALS["input_dir"],
                                       task.alg_fasta_file)
                    if ttype == "alg":
                        kept_columns = []
                    elif ttype == "acleaner":
                        # if original alignment was trimmed, use it as
                        # reference but make the nt alignment only on the
                        # kept columns
                        kept_columns = db.get_task_data(
                            taskid, DATATYPES.kept_alg_columns)

                    if not pexist(source_alg):
                        with open(source_alg, "w") as src_handle:
                            src_handle.write(
                                db.get_task_data(taskid, DATATYPES.alg_fasta))

                    nt_alg = switch_to_codon(source_alg,
                                             kept_columns=kept_columns)
                    db.add_task_data(taskid, DATATYPES.alg_nt_fasta,
                                     nt_alg.write())
                    db.add_task_data(taskid, DATATYPES.alg_nt_phylip,
                                     nt_alg.write(format='iphylip_relaxed'))

                npr_conf = IterConfig(conf, wkname, task.size, "nt")
                seqtype = "nt"

            if mtesterclass:
                next_task = mtesterclass(nodeid, alg_fasta_file,
                                         alg_phylip_file, constrain_id,
                                         conf, mtesterconf)
            elif treebuilderclass:
                next_task = treebuilderclass(nodeid, alg_phylip_file,
                                             constrain_id, None, seqtype,
                                             conf, treebuilderconf)
        if next_task:
            next_task.size = task.size
            new_tasks.append(next_task)

    elif ttype == "mchooser":
        if treebuilderclass:
            alg_phylip_file = task.alg_phylip_file
            model = task.best_model
            tree_task = treebuilderclass(nodeid, alg_phylip_file,
                                         constrain_id, model, seqtype,
                                         conf, treebuilderconf)
            tree_task.size = task.size
            new_tasks.append(tree_task)

    elif ttype == "tree":
        treemerge_task = splitterclass(nodeid, seqtype, task.tree_file, conf,
                                       splitterconf)
        treemerge_task.size = task.size
        new_tasks.append(treemerge_task)

    elif ttype == "treemerger":
        if not task.task_tree:
            task.finish()

        log.log(24, "Saving task tree...")
        annotate_node(task.task_tree, task)
        db.update_node(nid=task.nodeid, runid=task.threadid,
                       newick=db.encode(task.task_tree))
        db.commit()

        # `treebuilderclass` holds a class, not an instance, so the former
        # isinstance(treebuilderclass, DummyTree) test was always False and
        # dummy trees were never excluded from further iterations.
        if treebuilderclass is not DummyTree and npr_conf.max_iters > 1:
            current_iter = get_iternumber(threadid)
            if npr_conf.max_iters and current_iter >= npr_conf.max_iters:
                log.warning("Maximum number of iterations reached!")
            else:
                # Add new nodes
                source_seqtype = "aa" if "aa" in GLOBALS["seqtypes"] else "nt"
                ttree, mtree = task.task_tree, task.main_tree

                log.log(26, "Processing tree: %s seqs, %s outgroups",
                        len(target_seqs), len(out_seqs))
                # NOTE(review): "clean_alg_path" is never written by this
                # function (see "alg_clean_path" above), so this currently
                # always falls back to "alg_path".
                alg_path = node_info.get("clean_alg_path",
                                         node_info["alg_path"])
                for _node, seqs, outs, next_wkname in get_next_npr_node(
                        threadid, ttree, task.out_seqs, mtree, alg_path,
                        npr_conf):
                    log.log(24, "Registering new node: %s seqs, %s outgroups",
                            len(seqs), len(outs))
                    new_task_node = Msf(seqs, outs, seqtype=source_seqtype)
                    new_task_node.target_wkname = next_wkname
                    new_tasks.append(new_task_node)
    return new_tasks
def _stage_job_input_files(job):
    """Write to disk every input file declared by *job* that is missing.

    Keys of ``job.input_files`` shaped like ``"<taskid>.<int>"`` are resolved
    to a data id through ``db.get_dataid``; any other key is handed to
    ``db.get_data`` as is. Files go to the mapped output path, or to
    ``GLOBALS["input_dir"]`` when the mapped path is empty.
    """
    for ifile, outpath in six.iteritems(job.input_files):
        try:
            _tid, _did = ifile.split(".")
            _did = int(_did)
        except (IndexError, ValueError):
            dataid = ifile
        else:
            dataid = db.get_dataid(_tid, _did)

        if not outpath:
            outfile = pjoin(GLOBALS["input_dir"], ifile)
        else:
            outfile = pjoin(outpath, ifile)

        if not os.path.exists(outfile):
            # Fetch first: a failing lookup must not leave an empty file
            # behind, since it would pass the existence check next time.
            data = db.get_data(dataid)
            with open(outfile, "w") as handle:
                handle.write(data)


def _append_task_commands_to_log(task):
    """Append *task* and its command lines to the thread's ``cmd.log``.

    The log path is stored under ``GLOBALS[task.configid]['cmd_log_file']``;
    the file is created empty the first time a thread logs a task.
    """
    thread_conf = GLOBALS[task.configid]
    if 'cmd_log_file' not in thread_conf:
        thread_conf['cmd_log_file'] = pjoin(thread_conf["_outpath"], "cmd.log")
        with open(thread_conf['cmd_log_file'], "w"):
            pass

    cmd_lines = get_cmd_log(task)
    with open(thread_conf['cmd_log_file'], "a") as cmd_log:
        print(task, file=cmd_log)
        for c in cmd_lines:
            print(' '+'\t'.join(map(str, c)), file=cmd_log)


def _write_alignment_fasta(alg, outfile):
    """Dump *alg* to *outfile* as FASTA, using ``db.get_seq_name`` names."""
    with open(outfile, "w") as out:
        for name, seq, comments in alg:
            realname = db.get_seq_name(name)
            print(">%s\n%s" %(realname, seq), file=out)


def schedule(workflow_task_processor, pending_tasks, schedule_time, execution, debug, norender):
    """Run the task scheduling loop until no pending task is left.

    On every cycle the status of each pending task is refreshed, waiting
    jobs are pushed to the background launcher queue (when one is running),
    finished ("D") tasks are passed to *workflow_task_processor* to obtain
    follow-up tasks, and failed tasks are recorded. Threads without pending
    or failed tasks get their final tree (and root alignments) written.

    :param workflow_task_processor: callable invoked as
        ``workflow_task_processor(task, task.target_wkname)``; must return
        an iterable of new tasks and may raise ``TaskError``.
    :param pending_tasks: iterable of initial tasks.
    :param schedule_time: seconds to sleep between cycles.
    :param execution: 2-item sequence ``(execution_mode, run_detached)``.
    :param debug: ``"all"`` to log at level 10 from the start, or a task id
        prefix that switches to level 10 when a matching task is checked.
    :param norender: the tree image is generated only when this compares
        equal to ``False``.
    :returns: dict mapping configid to a list of
        ``[task, error_value, message]`` entries for failed tasks.
    """
    import sys
    import traceback
    from functools import cmp_to_key

    # Adjust debug mode
    if debug == "all":
        log.setLevel(10)
    pending_tasks = set(pending_tasks)

    ## ===================================
    ## INITIALIZE BASIC VARS
    execution, run_detached = execution
    thread2tasks = defaultdict(list)
    for task in pending_tasks:
        thread2tasks[task.configid].append(task)
    expected_threads = set(thread2tasks.keys())
    past_threads = {}
    thread_errors = defaultdict(list)
    ## END OF VARS AND SHORTCUTS
    ## ===================================

    cores_total = GLOBALS["_max_cores"]
    if cores_total > 0:
        job_queue = Queue()
        back_launcher = Process(target=background_job_launcher,
                                args=(job_queue, run_detached,
                                      GLOBALS["launch_time"], cores_total))
        back_launcher.start()
    else:
        job_queue = None
        back_launcher = None

    GLOBALS["_background_scheduler"] = back_launcher
    GLOBALS["_job_queue"] = job_queue

    last_report_time = None

    # Job ids whose input files were already staged and queued.
    BUG = set()

    # sort_tasks is a two-argument comparator (it used to be passed as the
    # positional ``cmp`` argument of sorted()); wrap it once so the call
    # works on both Python 2.7 and Python 3.
    task_sort_key = cmp_to_key(sort_tasks)

    # Enters into task scheduling
    while pending_tasks:
        wtime = schedule_time

        # SGE job querying is currently disabled, so no qstat information
        # is available in any execution mode. Always bind the name so that
        # get_status() can be called under "sge" as well.
        qstat_jobs = None

        # Show summary of pending tasks per thread
        thread2tasks = defaultdict(list)
        for task in pending_tasks:
            thread2tasks[task.configid].append(task)
        set_logindent(0)
        log.log(28, "@@13: Updating tasks status:@@1: (%s)" % (ctime()))
        info_lines = []
        for tid, tlist in six.iteritems(thread2tasks):
            threadname = GLOBALS[tid]["_name"]
            sizelist = ["%s" %getattr(_ts, "size", "?") for _ts in tlist]
            info = "Thread @@13:%s@@1:: pending tasks: @@8:%s@@1: of sizes: %s" %(
                threadname, len(tlist), ', '.join(sizelist))
            info_lines.append(info)

        for line in info_lines:
            log.log(28, line)

        if GLOBALS["email"] and last_report_time is None:
            last_report_time = time()
            send_mail(GLOBALS["email"], "Your NPR process has started",
                      '\n'.join(info_lines))

        ## ================================
        ## CHECK AND UPDATE CURRENT TASKS
        checked_tasks = set()
        to_add_tasks = set()

        GLOBALS["cached_status"] = {}
        for task in sorted(pending_tasks, key=task_sort_key):
            # Enter debuging mode if necessary
            if debug and log.level > 10 and task.taskid.startswith(debug):
                log.setLevel(10)
                log.debug("ENTERING IN DEBUGGING MODE")

            # Update tasks and job statuses
            if task.taskid not in checked_tasks:
                try:
                    show_task_info(task)
                    task.status = task.get_status(qstat_jobs)
                    db.dataconn.commit()
                    if back_launcher and task.status not in set("DE"):
                        for j, cmd in task.iter_waiting_jobs():
                            j.status = "Q"
                            GLOBALS["cached_status"][j.jobid] = "Q"
                            if j.jobid not in BUG:
                                if not os.path.exists(j.jobdir):
                                    os.makedirs(j.jobdir)
                                _stage_job_input_files(j)
                                log.log(24, " @@8:Queueing @@1: %s from %s" %(j, task))
                                if execution:
                                    job_queue.put([j.jobid, j.cores, cmd, j.status_file])
                            BUG.add(j.jobid)

                    update_task_states_recursively(task)
                    db.commit()
                    checked_tasks.add(task.taskid)
                except TaskError as e:
                    log.error("Errors found in %s" %task)
                    traceback.print_exc()
                    if GLOBALS["email"]:
                        threadname = GLOBALS[task.configid]["_name"]
                        send_mail(GLOBALS["email"],
                                  "Errors found in %s!" %threadname,
                                  '\n'.join(map(str, [task, e.value, e.msg])))
                    pending_tasks.discard(task)
                    thread_errors[task.configid].append([task, e.value, e.msg])
                    continue
            else:
                # Set temporary Queued state to avoids launching
                # jobs from clones
                task.status = "Q"
                if log.level < 24:
                    show_task_info(task)

            if task.status == "D":
                show_task_info(task)
                logindent(3)

                # Log commands of every task
                _append_task_commands_to_log(task)

                try:
                    create_tasks = workflow_task_processor(task, task.target_wkname)
                except TaskError as e:
                    log.error("Errors found in %s" %task)
                    # Undo the indentation opened above; otherwise the rest
                    # of this pass would be logged with a shifted indent.
                    logindent(-3)
                    pending_tasks.discard(task)
                    thread_errors[task.configid].append([task, e.value, e.msg])
                    continue
                else:
                    logindent(-3)
                    to_add_tasks.update(create_tasks)
                    pending_tasks.discard(task)

            elif task.status == "E":
                log.error("task contains errors: %s " %task)
                log.error("Errors found in %s" %task)
                pending_tasks.discard(task)
                thread_errors[task.configid].append([task, None, "Found (E) task status"])

        # Update global task list with recently added jobs to be check
        # during next cycle
        pending_tasks.update(to_add_tasks)

        ## END CHECK AND UPDATE CURRENT TASKS
        ## ================================

        if wtime:
            set_logindent(0)
            log.log(28, "@@13:Waiting %s seconds@@1:" %wtime)
            sleep(wtime)
        else:
            sleep(schedule_time)

        # Dump / show ended threads
        error_lines = []
        for configid, etasks in six.iteritems(thread_errors):
            error_lines.append("Thread @@10:%s@@1: contains errors:" %
                               (GLOBALS[configid]["_name"]))
            for error in etasks:
                error_lines.append(" ** %s" %error[0])
                e_obj = error[1] if error[1] else error[0]
                error_path = e_obj.jobdir if isjob(e_obj) else e_obj.taskid
                if e_obj is not error[0]:
                    error_lines.append(" -> %s" %e_obj)
                error_lines.append(" -> %s" %error_path)
                error_lines.append(" -> %s" %error[2])
        for eline in error_lines:
            log.error(eline)

        pending_threads = set([ts.configid for ts in pending_tasks])
        finished_threads = expected_threads - (pending_threads | set(thread_errors.keys()))
        just_finished_lines = []
        finished_lines = []
        for configid in finished_threads:
            # configid is the the same as threadid in master tasks
            final_tree_file = pjoin(GLOBALS[configid]["_outpath"],
                                    GLOBALS["inputname"] + ".final_tree")
            threadname = GLOBALS[configid]["_name"]

            if configid in past_threads:
                log.log(28, "Done thread @@12:%s@@1: in %d iteration(s)",
                        threadname, past_threads[configid])
                finished_lines.append("Finished %s in %d iteration(s)" %(
                    threadname, past_threads[configid]))
            else:
                log.log(28, "Assembling final tree...")
                main_tree, treeiters = assembly_tree(configid)
                past_threads[configid] = treeiters - 1

                log.log(28, "Done thread @@12:%s@@1: in %d iteration(s)",
                        threadname, past_threads[configid])

                log.log(28, "Writing final tree for @@13:%s@@1:\n %s\n %s",
                        threadname, final_tree_file+".nw",
                        final_tree_file+".nwx (newick extended)")
                main_tree.write(outfile=final_tree_file+".nw")
                main_tree.write(outfile=final_tree_file+ ".nwx", features=[],
                                format_root_node=True)

                # Reset per thread so an alignment loaded for a previous
                # thread is never attached to this thread's leaves.
                alg = None

                if hasattr(main_tree, "alg_path"):
                    log.log(28, "Writing root node alignment @@13:%s@@1:\n %s",
                            threadname, final_tree_file+".fa")
                    alg = SeqGroup(get_stored_data(main_tree.alg_path))
                    _write_alignment_fasta(alg, final_tree_file+".fa")

                if hasattr(main_tree, "clean_alg_path"):
                    log.log(28, "Writing root node trimmed alignment @@13:%s@@1:\n %s",
                            threadname, final_tree_file+".trimmed.fa")
                    alg = SeqGroup(get_stored_data(main_tree.clean_alg_path))
                    _write_alignment_fasta(alg, final_tree_file+".trimmed.fa")

                # Deliberately "== False": a None/unset flag must keep
                # skipping the rendering, as it always did.
                if norender == False:  # noqa: E712
                    log.log(28, "Generating tree image for @@13:%s@@1:\n %s",
                            threadname, final_tree_file+".png")
                    if alg is not None:
                        for lf in main_tree:
                            lf.add_feature("sequence", alg.get_seq(lf.safename))
                    try:
                        from ete3.tools.phylobuild_lib.visualize import draw_tree
                        draw_tree(main_tree, GLOBALS[configid], final_tree_file+".png")
                    except Exception:
                        # Rendering is best effort: never abort scheduling.
                        log.warning('@@8:something went wrong when generating the tree image. Try manually :(@@1:')
                        if DEBUG:
                            traceback.print_exc(file=sys.stdout)

                just_finished_lines.append("Finished %s in %d iteration(s)" %(
                    threadname, past_threads[configid]))

        if GLOBALS["email"]:
            if not pending_tasks:
                all_lines = finished_lines + just_finished_lines + error_lines
                send_mail(GLOBALS["email"], "Your NPR process has ended",
                          '\n'.join(all_lines))

            elif GLOBALS["email_report_time"] and time() - last_report_time >= \
                    GLOBALS["email_report_time"]:
                all_lines = info_lines + error_lines + just_finished_lines
                send_mail(GLOBALS["email"], "Your NPR report", '\n'.join(all_lines))
                last_report_time = time()

            elif just_finished_lines:
                send_mail(GLOBALS["email"], "Finished threads!",
                          '\n'.join(just_finished_lines))

        log.log(26, "")

    if thread_errors:
        log.error("Done with ERRORS")
    else:
        log.log(28, "Done")

    return thread_errors
def process_task(task, wkname, npr_conf, nodeid2info):
    """Return the follow-up tasks generated by a finished genetree *task*.

    The action depends on ``task.ttype``:

    * ``"msf"``: stores the tree constraints and the node in the database
      and creates the aligner task.
    * ``"alg"`` / ``"acleaner"``: records the alignment path, computes the
      alignment identity stats when requested, optionally switches to a
      codon alignment, and creates the cleaner, model tester or tree
      builder task.
    * ``"mchooser"``: creates the tree builder task using the best model.
    * ``"tree"``: creates the tree splitter task.
    * ``"treemerger"``: saves the task tree and, unless iterations are
      exhausted or no real tree builder is in use, registers new ``Msf``
      nodes for the next iteration.

    :param task: the finished task to process.
    :param wkname: workflow name, used to rebuild the configuration when
        switching to nucleotide sequences.
    :param npr_conf: configuration object exposing ``(conf, class)`` pairs
        for each workflow step.
    :param nodeid2info: dict of per-node info dicts, updated in place.
    :returns: list of newly created tasks (possibly empty).
    """
    alignerconf, alignerclass = npr_conf.aligner
    cleanerconf, cleanerclass = npr_conf.alg_cleaner
    mtesterconf, mtesterclass = npr_conf.model_tester
    treebuilderconf, treebuilderclass = npr_conf.tree_builder
    splitterconf, splitterclass = npr_conf.tree_splitter

    conf = GLOBALS[task.configid]
    seqtype = task.seqtype
    nodeid = task.nodeid
    ttype = task.ttype
    taskid = task.taskid
    threadid = task.threadid
    node_info = nodeid2info[nodeid]
    size = task.size
    target_seqs = node_info.get("target_seqs", [])
    out_seqs = node_info.get("out_seqs", [])

    if not treebuilderclass or size < 4:
        # Allows to dump algs in workflows with no tree tasks or if tree
        # inference does not make sense given the number of sequences.
        # DummyTree will produce a fake fully collapsed newick tree.
        treebuilderclass = DummyTree

    # If more than one outgroup are used, enable the use of constrain
    if out_seqs and len(out_seqs) > 1:
        constrain_id = nodeid
    else:
        constrain_id = None

    new_tasks = []
    if ttype == "msf":
        # Register Tree constrains
        constrain_tree = "(%s, (%s));" %(','.join(sorted(task.out_seqs)),
                                         ','.join(sorted(task.target_seqs)))
        _outs = "\n".join([">%s\n0" %name for name in sorted(task.out_seqs)])
        _tars = "\n".join([">%s\n1" %name for name in sorted(task.target_seqs)])
        constrain_alg = '\n'.join([_outs, _tars])
        db.add_task_data(nodeid, DATATYPES.constrain_tree, constrain_tree)
        db.add_task_data(nodeid, DATATYPES.constrain_alg, constrain_alg)
        # Commit right now: the creation of some Task objects may require
        # this info.
        db.dataconn.commit()

        # Register node
        db.add_node(task.threadid,
                    task.nodeid, task.cladeid,
                    task.target_seqs,
                    task.out_seqs)

        nodeid2info[nodeid]["size"] = task.size
        nodeid2info[nodeid]["target_seqs"] = task.target_seqs
        nodeid2info[nodeid]["out_seqs"] = task.out_seqs
        alg_task = alignerclass(nodeid, task.multiseq_file,
                                seqtype, conf, alignerconf)
        alg_task.size = task.size
        new_tasks.append(alg_task)

    elif ttype == "alg" or ttype == "acleaner":
        if ttype == "alg":
            nodeid2info[nodeid]["alg_path"] = task.alg_fasta_file
        elif ttype == "acleaner":
            # NOTE(review): stored as "alg_clean_path" but the treemerger
            # branch below reads "clean_alg_path" -- confirm which key is
            # intended before unifying them.
            nodeid2info[nodeid]["alg_clean_path"] = task.clean_alg_fasta_file

        alg_fasta_file = getattr(task, "clean_alg_fasta_file",
                                 task.alg_fasta_file)
        alg_phylip_file = getattr(task, "clean_alg_phylip_file",
                                  task.alg_phylip_file)

        # Calculate alignment stats
        if seqtype == "aa" and npr_conf.switch_aa_similarity < 1:
            try:
                alg_stats = db.get_task_data(taskid, DATATYPES.alg_stats)
            except Exception:
                # No usable stored stats: compute them below.
                alg_stats = {}

            if ttype == "alg":
                dataid = DATATYPES.alg_phylip
            elif ttype == "acleaner":
                dataid = DATATYPES.clean_alg_phylip

            if "i_mean" not in alg_stats:
                log.log(24, "Calculating alignment stats...")
                # dump data if necesary
                # NOTE(review): the file is always named after
                # task.alg_phylip_file, even for "acleaner" tasks whose
                # data type is clean_alg_phylip -- confirm this is intended.
                algfile = pjoin(GLOBALS["input_dir"], task.alg_phylip_file)
                if not pexist(algfile):
                    # dump phylip alg; fetch first so a failed lookup does
                    # not leave an empty file behind
                    alg_data = db.get_data(db.get_dataid(taskid, dataid))
                    with open(algfile, "w") as handle:
                        handle.write(alg_data)

                mx, mn, mean, std = get_statal_identity(algfile,
                                                        conf["app"]["statal"])
                alg_stats = {"i_max":mx, "i_mean":mean, "i_min":mn, "i_std":std}
                db.add_task_data(taskid, DATATYPES.alg_stats, alg_stats)

            log.log(22, "Alignment stats (sequence similarity):")
            log.log(22, " max: %(i_max)0.2f, min:%(i_min)0.2f, avg:%(i_mean)0.2f+-%(i_std)0.2f" %
                    (alg_stats))
        else:
            alg_stats = {"i_max":-1, "i_mean":-1, "i_min":-1, "i_std":-1}

        task.max_ident = alg_stats["i_max"]
        task.min_ident = alg_stats["i_min"]
        task.mean_ident = alg_stats["i_mean"]
        task.std_ident = alg_stats["i_std"]
        next_task = None

        if ttype == "alg" and cleanerclass:
            next_task = cleanerclass(nodeid, seqtype, alg_fasta_file, alg_phylip_file,
                                     conf, cleanerconf)
        else:
            # Converts aa alignment into nt if necessary
            if seqtype == "aa" and \
               "nt" in GLOBALS["seqtypes"] and \
               task.mean_ident >= npr_conf.switch_aa_similarity:
                log.log(28, "@@2:Switching to codon alignment!@@1: amino-acid sequence similarity: %0.2f >= %0.2f" %
                        (task.mean_ident, npr_conf.switch_aa_similarity))
                alg_fasta_file = "%s.%s" %(taskid, DATATYPES.alg_nt_fasta)
                alg_phylip_file = "%s.%s" %(taskid, DATATYPES.alg_nt_phylip)
                try:
                    nt_fasta_id = db.get_dataid(taskid, DATATYPES.alg_nt_fasta)
                    nt_phylip_id = db.get_dataid(taskid, DATATYPES.alg_nt_phylip)
                except ValueError:
                    log.log(22, "Calculating codon alignment...")
                    source_alg = pjoin(GLOBALS["input_dir"], task.alg_fasta_file)
                    if ttype == "alg":
                        kept_columns = []
                    elif ttype == "acleaner":
                        # if original alignment was trimmed, use it as reference
                        # but make the nt alignment only on the kept columns
                        kept_columns = db.get_task_data(taskid, DATATYPES.kept_alg_columns)

                    if not pexist(source_alg):
                        source_data = db.get_task_data(taskid, DATATYPES.alg_fasta)
                        with open(source_alg, "w") as handle:
                            handle.write(source_data)

                    nt_alg = switch_to_codon(source_alg, kept_columns=kept_columns)
                    db.add_task_data(taskid, DATATYPES.alg_nt_fasta, nt_alg.write())
                    db.add_task_data(taskid, DATATYPES.alg_nt_phylip,
                                     nt_alg.write(format='iphylip_relaxed'))
                else:
                    # Both codon alignments are already stored: reuse them.
                    # The phylip id used to be assigned to alg_fasta_file
                    # by mistake, leaving the fasta name pointing at the
                    # phylip data.
                    alg_fasta_file = nt_fasta_id
                    alg_phylip_file = nt_phylip_id

                npr_conf = IterConfig(conf, wkname, task.size, "nt")
                seqtype = "nt"

            if mtesterclass:
                next_task = mtesterclass(nodeid, alg_fasta_file,
                                         alg_phylip_file,
                                         constrain_id,
                                         conf, mtesterconf)
            elif treebuilderclass:
                next_task = treebuilderclass(nodeid, alg_phylip_file, constrain_id,
                                             None, seqtype,
                                             conf, treebuilderconf)
        if next_task:
            next_task.size = task.size
            new_tasks.append(next_task)

    elif ttype == "mchooser":
        if treebuilderclass:
            alg_fasta_file = task.alg_fasta_file
            alg_phylip_file = task.alg_phylip_file
            model = task.best_model
            tree_task = treebuilderclass(nodeid, alg_phylip_file, constrain_id,
                                         model, seqtype,
                                         conf, treebuilderconf)
            tree_task.size = task.size
            new_tasks.append(tree_task)

    elif ttype == "tree":
        treemerge_task = splitterclass(nodeid, seqtype, task.tree_file, conf, splitterconf)
        treemerge_task.size = task.size
        new_tasks.append(treemerge_task)

    elif ttype == "treemerger":
        if not task.task_tree:
            task.finish()

        log.log(24, "Saving task tree...")
        annotate_node(task.task_tree, task)
        db.update_node(nid=task.nodeid, runid=task.threadid,
                       newick=db.encode(task.task_tree))
        db.commit()

        # treebuilderclass is the class itself, not an instance, so an
        # isinstance() test against DummyTree could never match; compare
        # identities to really skip iterations on dummy trees.
        if treebuilderclass is not DummyTree and npr_conf.max_iters > 1:
            current_iter = get_iternumber(threadid)
            if npr_conf.max_iters and current_iter >= npr_conf.max_iters:
                log.warning("Maximum number of iterations reached!")
            else:
                # Add new nodes
                source_seqtype = "aa" if "aa" in GLOBALS["seqtypes"] else "nt"
                ttree, mtree = task.task_tree, task.main_tree

                log.log(26, "Processing tree: %s seqs, %s outgroups",
                        len(target_seqs), len(out_seqs))

                alg_path = node_info.get("clean_alg_path", node_info["alg_path"])
                for node, seqs, outs, wkname in get_next_npr_node(threadid, ttree,
                                                                  task.out_seqs, mtree,
                                                                  alg_path, npr_conf):
                    log.log(24, "Registering new node: %s seqs, %s outgroups",
                            len(seqs), len(outs))
                    new_task_node = Msf(seqs, outs, seqtype=source_seqtype)
                    new_task_node.target_wkname = wkname
                    new_tasks.append(new_task_node)

    return new_tasks