def finish(self):
    # Once executed, alignment is converted into relaxed
    # interleaved phylip format.
    alg = SeqGroup(os.path.join(self.jobs[0].jobdir, "alg.fasta"))
    fasta = alg.write(format="fasta")
    phylip = alg.write(format="iphylip_relaxed")
    AlgTask.store_data(self, fasta, phylip)

def switch_to_codon(alg_fasta_file, kept_columns=None):
    # Check conservation of columns. If too many identities,
    # switch to codon alignment and make the tree with DNA.
    # Mixed models are another possibility.
    if kept_columns:
        kept_columns = set(map(int, kept_columns))
    else:
        kept_columns = []

    #all_nt_alg = SeqGroup(nt_seed_file)
    aa_alg = SeqGroup(alg_fasta_file)
    nt_alg = SeqGroup()

    for seqname, aaseq, comments in aa_alg.iter_entries():
        #ntseq = all_nt_alg.get_seq(seqname).upper()
        ntseq = db.get_seq(seqname, "nt").upper()
        ntalgseq = []
        nt_pos = 0
        for pos, ch in enumerate(aaseq):
            if ch in GAP_CHARS:
                codon = "---"
            else:
                codon = ntseq[nt_pos:nt_pos + 3]
                nt_pos += 3

            if not kept_columns or pos in kept_columns:
                # we trust the sequence in DB, consistency should have been
                # checked during the start up
                ntalgseq.append(codon)

        ntalgseq = "".join(ntalgseq)
        nt_alg.set_seq(seqname, ntalgseq)

    return nt_alg

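# Minimal, self-contained sketch of the back-translation idea used by
# switch_to_codon above: each aligned amino acid is mapped onto its source
# codon, and alignment gaps become gap triplets. The names used here
# (back_translate, the GAP_CHARS set) are illustrative assumptions, not part
# of the library API.
GAP_CHARS = set("-.")

def back_translate(aligned_aa, cds):
    """Return one codon-alignment row.

    aligned_aa -- gapped amino acid sequence, e.g. "M-KL"
    cds        -- ungapped coding sequence, e.g. "ATGAAACTG"
    """
    codons = []
    nt_pos = 0
    for ch in aligned_aa:
        if ch in GAP_CHARS:
            codons.append("---")              # keep the gap, three columns wide
        else:
            codons.append(cds[nt_pos:nt_pos + 3])
            nt_pos += 3
    return "".join(codons)

# Example: back_translate("M-KL", "ATGAAACTG") -> "ATG---AAACTG"
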
def finish(self):
    # Once executed, alignment is converted into relaxed
    # interleaved phylip format. Both files, fasta and phylip,
    # remain accessible.

    # Set Task specific attributes
    main_job = self.jobs[0]
    fasta_path = pjoin(main_job.jobdir, "clean.alg.fasta")
    alg = SeqGroup(fasta_path)

    if len(alg) != self.size:
        log.warning("Trimming was too aggressive and it tried"
                    " to remove one or more sequences."
                    " Alignment trimming will be disabled for this dataset.")
        self.clean_alg_fasta_file = db.register_task_data(
            self.taskid, DATATYPES.clean_alg_fasta, self.alg_fasta_file)
        self.clean_alg_phylip_file = db.register_task_data(
            self.taskid, DATATYPES.clean_alg_phylip, self.alg_phylip_file)
    else:
        for line in open(self.jobs[0].stdout_file):
            line = line.strip()
            if line.startswith("#ColumnsMap"):
                kept_columns = list(map(int, line.split("\t")[1].split(",")))
        fasta = alg.write(format="fasta")
        phylip = alg.write(format="iphylip_relaxed")
        AlgCleanerTask.store_data(self, fasta, phylip, kept_columns)

def finish(self):
    # Once executed, alignment is converted into relaxed
    # interleaved phylip format.
    final_job = self.jobs[2]
    alg = SeqGroup(os.path.join(final_job.jobdir, "alg.fasta"))
    alg.write(outfile=self.alg_fasta_file, format="fasta")
    alg.write(outfile=self.alg_phylip_file, format="iphylip_relaxed")
    AlgTask.finish(self)

def finish(self):
    # Once executed, alignment is converted into relaxed
    # interleaved phylip format.
    alg = SeqGroup(os.path.join(self.jobs[0].jobdir, "mcoffee.fasta"))
    fasta = alg.write(format="fasta")
    phylip = alg.write(format="iphylip_relaxed")

    alg_list_string = '\n'.join([pjoin(GLOBALS["input_dir"], aname)
                                 for aname in self.all_alg_files])
    db.add_task_data(self.taskid, DATATYPES.alg_list, alg_list_string)

    AlgTask.store_data(self, fasta, phylip)

def finish(self):
    if self.conf[self.confname]["_alg_trimming"]:
        # If trimming happened after mcoffee, let's save the
        # resulting output
        trim_job = self.jobs[-1]
        alg = SeqGroup(pjoin(trim_job.jobdir, trim_job.alg_fasta_file))
        fasta = alg.write(format="fasta")
        phylip = alg.write(format="iphylip_relaxed")
        AlgTask.store_data(self, fasta, phylip)
    else:
        # If no post trimming, output is just what Mcoffee
        # produced, so we can recycle its data ids.
        mc_task = self.jobs[-1]
        fasta_id = db.get_dataid(mc_task.taskid, DATATYPES.alg_fasta)
        phylip_id = db.get_dataid(mc_task.taskid, DATATYPES.alg_phylip)
        db.register_task_data(self.taskid, DATATYPES.alg_fasta, fasta_id)
        db.register_task_data(self.taskid, DATATYPES.alg_phylip, phylip_id)

def get_identity(fname):
    s = SeqGroup(fname)
    # Alignment length: all sequences share the same length, so take the first
    seqlen = len(next(six.itervalues(s.id2seq)))
    ident = list()
    for i in range(seqlen):
        states = defaultdict(int)
        for seq in six.itervalues(s.id2seq):
            if seq[i] != "-":
                states[seq[i]] += 1
        values = list(states.values())
        if values:
            ident.append(float(max(values)) / sum(values))
    return (_max(ident), _min(ident), _mean(ident), _std(ident))

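# Standalone sketch of the per-column identity computation in get_identity,
# without the SeqGroup/six machinery: for each alignment column, identity is
# the frequency of the most common non-gap residue. Function and variable
# names here are illustrative only.
from collections import defaultdict

def column_identities(sequences):
    """sequences: list of equally long, gapped strings."""
    ident = []
    for col in zip(*sequences):                       # iterate over columns
        counts = defaultdict(int)
        for ch in col:
            if ch != "-":
                counts[ch] += 1
        if counts:
            ident.append(max(counts.values()) / float(sum(counts.values())))
    return ident

# Example: column_identities(["ATG-A", "ATGCA", "ATGCT"])
# -> [1.0, 1.0, 1.0, 1.0, 0.666...]
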
def schedule(workflow_task_processor, pending_tasks, schedule_time, execution,
             debug, norender):
    # Adjust debug mode
    if debug == "all":
        log.setLevel(10)
    pending_tasks = set(pending_tasks)

    ## ===================================
    ## INITIALIZE BASIC VARS
    execution, run_detached = execution
    thread2tasks = defaultdict(list)
    for task in pending_tasks:
        thread2tasks[task.configid].append(task)
    expected_threads = set(thread2tasks.keys())
    past_threads = {}
    thread_errors = defaultdict(list)
    ## END OF VARS AND SHORTCUTS
    ## ===================================

    cores_total = GLOBALS["_max_cores"]
    if cores_total > 0:
        job_queue = Queue()
        back_launcher = Process(target=background_job_launcher,
                                args=(job_queue, run_detached,
                                      GLOBALS["launch_time"], cores_total))
        back_launcher.start()
    else:
        job_queue = None
        back_launcher = None

    GLOBALS["_background_scheduler"] = back_launcher
    GLOBALS["_job_queue"] = job_queue

    # Captures Ctrl-C for debugging
    #signal.signal(signal.SIGINT, control_c)

    last_report_time = None

    BUG = set()
    try:
        # Enter the task scheduling loop
        while pending_tasks:
            wtime = schedule_time

            # ask SGE for running jobs
            if execution == "sge":
                #sgeid2jobs = db.get_sge_tasks()
                #qstat_jobs = sge.qstat()
                pass
            else:
                qstat_jobs = None

            # Show summary of pending tasks per thread
            thread2tasks = defaultdict(list)
            for task in pending_tasks:
                thread2tasks[task.configid].append(task)
            set_logindent(0)
            log.log(28, "@@13: Updating tasks status:@@1: (%s)" % (ctime()))
            info_lines = []
            for tid, tlist in six.iteritems(thread2tasks):
                threadname = GLOBALS[tid]["_name"]
                sizelist = ["%s" % getattr(_ts, "size", "?") for _ts in tlist]
                info = "Thread @@13:%s@@1:: pending tasks: @@8:%s@@1: of sizes: %s" % (
                    threadname, len(tlist), ', '.join(sizelist))
                info_lines.append(info)

            for line in info_lines:
                log.log(28, line)

            if GLOBALS["email"] and last_report_time is None:
                last_report_time = time()
                send_mail(GLOBALS["email"], "Your NPR process has started",
                          '\n'.join(info_lines))

            ## ================================
            ## CHECK AND UPDATE CURRENT TASKS
            checked_tasks = set()
            check_start_time = time()
            to_add_tasks = set()

            GLOBALS["cached_status"] = {}
            for task in sorted(pending_tasks, sort_tasks):
                # Avoids endless periods without new job submissions
                elapsed_time = time() - check_start_time
                #if not back_launcher and pending_tasks and \
                #        elapsed_time > schedule_time * 2:
                #    log.log(26, "@@8:Interrupting task checks to schedule new jobs@@1:")
                #    db.commit()
                #    wtime = launch_jobs(sorted(pending_tasks, sort_tasks),
                #                        execution, run_detached)
                #    check_start_time = time()

                # Enter debugging mode if necessary
                if debug and log.level > 10 and task.taskid.startswith(debug):
                    log.setLevel(10)
                    log.debug("ENTERING IN DEBUGGING MODE")
                thread2tasks[task.configid].append(task)

                # Update tasks and job statuses
                if task.taskid not in checked_tasks:
                    try:
                        show_task_info(task)
                        task.status = task.get_status(qstat_jobs)
                        db.dataconn.commit()
                        if back_launcher and task.status not in set("DE"):
                            for j, cmd in task.iter_waiting_jobs():
                                j.status = "Q"
                                GLOBALS["cached_status"][j.jobid] = "Q"
                                if j.jobid not in BUG:
                                    if not os.path.exists(j.jobdir):
                                        os.makedirs(j.jobdir)
                                    for ifile, outpath in six.iteritems(j.input_files):
                                        try:
                                            _tid, _did = ifile.split(".")
                                            _did = int(_did)
                                        except (IndexError, ValueError):
                                            dataid = ifile
                                        else:
                                            dataid = db.get_dataid(_tid, _did)

                                        if not outpath:
                                            outfile = pjoin(GLOBALS["input_dir"], ifile)
                                        else:
                                            outfile = pjoin(outpath, ifile)

                                        if not os.path.exists(outfile):
                                            open(outfile, "w").write(db.get_data(dataid))

                                    log.log(24, " @@8:Queueing @@1: %s from %s" % (j, task))
                                    if execution:
                                        job_queue.put([j.jobid, j.cores, cmd,
                                                       j.status_file])
                                BUG.add(j.jobid)

                        update_task_states_recursively(task)
                        db.commit()
                        checked_tasks.add(task.taskid)
                    except TaskError as e:
                        log.error("Errors found in %s" % task)
                        import traceback
                        traceback.print_exc()
                        if GLOBALS["email"]:
                            threadname = GLOBALS[task.configid]["_name"]
                            send_mail(GLOBALS["email"],
                                      "Errors found in %s!" % threadname,
                                      '\n'.join(map(str, [task, e.value, e.msg])))
                        pending_tasks.discard(task)
                        thread_errors[task.configid].append([task, e.value, e.msg])
                        continue
                else:
                    # Set temporary Queued state to avoid launching
                    # jobs from clones
                    task.status = "Q"
                    if log.level < 24:
                        show_task_info(task)

                if task.status == "D":
                    #db.commit()
                    show_task_info(task)
                    logindent(3)

                    # Log commands of every task
                    if 'cmd_log_file' not in GLOBALS[task.configid]:
                        GLOBALS[task.configid]['cmd_log_file'] = pjoin(
                            GLOBALS[task.configid]["_outpath"], "cmd.log")
                        O = open(GLOBALS[task.configid]['cmd_log_file'], "w")
                        O.close()
                    cmd_lines = get_cmd_log(task)
                    CMD_LOG = open(GLOBALS[task.configid]['cmd_log_file'], "a")
                    print(task, file=CMD_LOG)
                    for c in cmd_lines:
                        print(' ' + '\t'.join(map(str, c)), file=CMD_LOG)
                    CMD_LOG.close()

                    try:
                        #wkname = GLOBALS[task.configid]['_name']
                        create_tasks = workflow_task_processor(task,
                                                               task.target_wkname)
                    except TaskError as e:
                        log.error("Errors found in %s" % task)
                        pending_tasks.discard(task)
                        thread_errors[task.configid].append([task, e.value, e.msg])
                        continue
                    else:
                        logindent(-3)
                        to_add_tasks.update(create_tasks)
                        pending_tasks.discard(task)

                elif task.status == "E":
                    log.error("task contains errors: %s " % task)
                    log.error("Errors found in %s" % task)
                    pending_tasks.discard(task)
                    thread_errors[task.configid].append(
                        [task, None, "Found (E) task status"])

            #db.commit()
            #if not back_launcher:
            #    wtime = launch_jobs(sorted(pending_tasks, sort_tasks),
            #                        execution, run_detached)

            # Update global task list with recently added jobs, to be checked
            # during the next cycle
            pending_tasks.update(to_add_tasks)

            ## END CHECK AND UPDATE CURRENT TASKS
            ## ================================

            if wtime:
                set_logindent(0)
                log.log(28, "@@13:Waiting %s seconds@@1:" % wtime)
                sleep(wtime)
            else:
                sleep(schedule_time)

            # Dump / show ended threads
            error_lines = []
            for configid, etasks in six.iteritems(thread_errors):
                error_lines.append("Thread @@10:%s@@1: contains errors:" %
                                   (GLOBALS[configid]["_name"]))
                for error in etasks:
                    error_lines.append(" ** %s" % error[0])
                    e_obj = error[1] if error[1] else error[0]
                    error_path = e_obj.jobdir if isjob(e_obj) else e_obj.taskid
                    if e_obj is not error[0]:
                        error_lines.append(" -> %s" % e_obj)
                    error_lines.append(" -> %s" % error_path)
                    error_lines.append(" -> %s" % error[2])
            for eline in error_lines:
                log.error(eline)

            pending_threads = set([ts.configid for ts in pending_tasks])
            finished_threads = expected_threads - (pending_threads |
                                                   set(thread_errors.keys()))
            just_finished_lines = []
            finished_lines = []
            for configid in finished_threads:
                # configid is the same as threadid in master tasks
                final_tree_file = pjoin(GLOBALS[configid]["_outpath"],
                                        GLOBALS["inputname"] + ".final_tree")
                threadname = GLOBALS[configid]["_name"]

                if configid in past_threads:
                    log.log(28, "Done thread @@12:%s@@1: in %d iteration(s)",
                            threadname, past_threads[configid])
                    finished_lines.append("Finished %s in %d iteration(s)" %
                                          (threadname, past_threads[configid]))
                else:
                    log.log(28, "Assembling final tree...")
                    main_tree, treeiters = assembly_tree(configid)
                    past_threads[configid] = treeiters - 1

                    log.log(28, "Done thread @@12:%s@@1: in %d iteration(s)",
                            threadname, past_threads[configid])
                    log.log(28, "Writing final tree for @@13:%s@@1:\n %s\n %s",
                            threadname, final_tree_file + ".nw",
                            final_tree_file + ".nwx (newick extended)")
                    main_tree.write(outfile=final_tree_file + ".nw")
                    main_tree.write(outfile=final_tree_file + ".nwx",
                                    features=[], format_root_node=True)

                    if hasattr(main_tree, "alg_path"):
                        log.log(28, "Writing root node alignment @@13:%s@@1:\n %s",
                                threadname, final_tree_file + ".fa")
                        alg = SeqGroup(get_stored_data(main_tree.alg_path))
                        OUT = open(final_tree_file + ".fa", "w")
                        for name, seq, comments in alg:
                            realname = db.get_seq_name(name)
                            print(">%s\n%s" % (realname, seq), file=OUT)
                        OUT.close()

                    if hasattr(main_tree, "clean_alg_path"):
                        log.log(28, "Writing root node trimmed alignment @@13:%s@@1:\n %s",
                                threadname, final_tree_file + ".trimmed.fa")
                        alg = SeqGroup(get_stored_data(main_tree.clean_alg_path))
                        OUT = open(final_tree_file + ".trimmed.fa", "w")
                        for name, seq, comments in alg:
                            realname = db.get_seq_name(name)
                            print(">%s\n%s" % (realname, seq), file=OUT)
                        OUT.close()

                    if norender == False:
                        log.log(28, "Generating tree image for @@13:%s@@1:\n %s",
                                threadname, final_tree_file + ".png")
                        for lf in main_tree:
                            lf.add_feature("sequence", alg.get_seq(lf.safename))
                        try:
                            from ete3.tools.phylobuild_lib.visualize import draw_tree
                            draw_tree(main_tree, GLOBALS[configid],
                                      final_tree_file + ".png")
                        except Exception as e:
                            log.warning('@@8:something went wrong when generating the tree image. Try manually :(@@1:')
                            if DEBUG:
                                import traceback, sys
                                traceback.print_exc(file=sys.stdout)

                    just_finished_lines.append("Finished %s in %d iteration(s)" %
                                               (threadname, past_threads[configid]))

            if GLOBALS["email"]:
                if not pending_tasks:
                    all_lines = finished_lines + just_finished_lines + error_lines
                    send_mail(GLOBALS["email"], "Your NPR process has ended",
                              '\n'.join(all_lines))
                elif GLOBALS["email_report_time"] and time() - last_report_time >= \
                        GLOBALS["email_report_time"]:
                    all_lines = info_lines + error_lines + just_finished_lines
                    send_mail(GLOBALS["email"], "Your NPR report",
                              '\n'.join(all_lines))
                    last_report_time = time()
                elif just_finished_lines:
                    send_mail(GLOBALS["email"], "Finished threads!",
                              '\n'.join(just_finished_lines))

            log.log(26, "")
    except:
        raise

    if thread_errors:
        log.error("Done with ERRORS")
    else:
        log.log(28, "Done")

    return thread_errors

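# Stripped-down sketch of the producer/consumer pattern schedule() relies on:
# the main loop puts job descriptions on a multiprocessing Queue, and a
# background Process consumes and launches them. All names below are
# hypothetical simplifications; the real background_job_launcher also handles
# core limits, detached execution and per-job status files.
from multiprocessing import Process, Queue

def background_launcher(job_queue):
    while True:
        job = job_queue.get()
        if job is None:                  # sentinel: no more jobs
            break
        jobid, cores, cmd = job
        print("launching %s (%d cores): %s" % (jobid, cores, cmd))

if __name__ == "__main__":
    q = Queue()
    worker = Process(target=background_launcher, args=(q,))
    worker.start()
    q.put(("job1", 1, "echo hello"))
    q.put(("job2", 2, "echo world"))
    q.put(None)                          # tell the launcher to stop
    worker.join()
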
def finish(self):
    alg = SeqGroup(os.path.join(self.jobs[0].jobdir, "alg.fasta"))
    fasta = alg.write(format="fasta")
    phylip = alg.write(format="iphylip_relaxed")
    AlgTask.store_data(self, fasta, phylip)

def split_tree(task_tree_node, task_outgroups, main_tree, alg_path, npr_conf,
               threadid, target_cladeids):
    """Browses a task tree from root to leaves and yields the next
    suitable nodes for NPR iterations. Each yielded node comes with
    the set of target and outgroup tips.
    """

    def processable_node(_n):
        """This is an internal function that returns true if a given node
        is suitable for a NPR iteration. It can be used as "is_leaf_fn"
        when traversing a tree.

        Note that this function uses several variables which change within
        the split_tree function, so it must be kept within its namespace.
        """
        is_leaf = False
        for wkname, wkfilter in npr_conf.npr_workflows:
            # if node is not in the targets or does not meet size filters,
            # skip workflow
            if _n is master_node or \
               (_TARGET_NODES and _n not in _TARGET_NODES) or \
               (target_cladeids and _n.cladeid not in target_cladeids) or \
               len(n2content[_n]) < max(wkfilter.get("min_size", 3), 3) or \
               ("max_size" in wkfilter and len(n2content[_n]) > wkfilter["max_size"]):
                continue

            # If seq_sim filter used, calculate node stats
            if ALG and ("min_seq_sim" in wkfilter or "max_seq_sim" in wkfilter):
                if not hasattr(_n, "seqs_mean_ident"):
                    log.log(20, "Calculating node sequence stats...")
                    mx, mn, avg, std = get_seqs_identity(
                        ALG, [__n.name for __n in n2content[_n]])
                    _n.add_features(seqs_max_ident=mx, seqs_min_ident=mn,
                                    seqs_mean_ident=avg, seqs_std_ident=std)
                    log.log(20, "mx=%s, mn=%s, avg=%s, std=%s" % (mx, mn, avg, std))

                if _n.seqs_mean_ident < wkfilter["min_seq_sim"]:
                    continue

                if _n.seqs_mean_ident > wkfilter["max_seq_sim"]:
                    continue
            else:
                _n.add_features(seqs_max_ident=None, seqs_min_ident=None,
                                seqs_mean_ident=None, seqs_std_ident=None)

            if "min_support" in wkfilter:
                # If we are optimizing only lowly supported nodes, and nodes are
                # optimized without an outgroup, our target node is actually the
                # parent of lowly supported nodes. Therefore, I check if support
                # is low in children nodes, and return this node if so.
                if not npr_conf.use_outgroup:
                    if not [_ch for _ch in _n.children
                            if _ch.support <= wkfilter["min_support"]]:
                        continue
                # Otherwise, just skip the node if it is above the min support
                elif _n.support > wkfilter["min_support"]:
                    continue

            # At this point, the node has passed all the filters of this
            # workflow, so it can be optimized
            is_leaf = True
            _n._target_wkname = wkname
            break

        return is_leaf

    log.log(20, "Loading tree content...")
    n2content = main_tree.get_cached_content()
    if alg_path:
        log.log(20, "Loading associated alignment to check seq. similarity")
        raw_alg = db.get_task_data(*alg_path.split("."))
        ALG = SeqGroup(raw_alg)
    else:
        ALG = None

    log.log(20, "Finding next NPR nodes...")
    # task_tree_node is actually a node in main_tree, since it has already
    # been merged
    trees_to_browse = [task_tree_node]
    npr_nodes = 0
    # load current tree content, so we can avoid reconstructing exactly the
    # same tree
    tasktree_content = set([leaf.name for leaf in n2content[task_tree_node]]) | \
        set(task_outgroups)
    while trees_to_browse:
        master_node = trees_to_browse.pop()

        # if custom taxa levels are defined as targets, find them in this
        # subtree
        _TARGET_NODES = defaultdict(list)  # this container is used by the
                                           # processable_node function
        opt_levels = GLOBALS[threadid].get('_optimized_levels', None)
        if opt_levels is not None:
            # any descendant of the already processed node is suitable for
            # selection. If the ancestor of level-species is on top of the
            # task_tree_node, it will be discarded
            avail_nodes = set(master_node.get_descendants())
            for lin in opt_levels:
                sp2lin, lin2sp = GLOBALS["lineages"]
                optimized, strict_monophyly = opt_levels[lin]
                if not optimized:
                    ancestor = main_tree.get_common_ancestor(*lin2sp[lin])
                    if ancestor in avail_nodes:
                        # check that the node satisfies level monophyly config
                        ancestor_content = set([x.name for x in n2content[ancestor]])
                        if not strict_monophyly or lin2sp[lin] == ancestor_content:
                            _TARGET_NODES[ancestor].append(lin)
                        elif strict_monophyly:
                            log.log(26, "Discarding non-monophyletic level @@11:%s@@1:" % lin)
                    else:
                        log.log(26, "Discarding upper clade @@11:%s@@1:" % lin)

        for node in master_node.iter_leaves(is_leaf_fn=processable_node):
            if opt_levels:
                log.log(28, "Trying to optimize custom tree level: @@11:%s@@1:" %
                        _TARGET_NODES[node])
                for lin in _TARGET_NODES[node]:
                    # Mark the level as optimized, so it is not computed again
                    opt_levels[lin][0] = True

            log.log(28, "Found possible target node of size %s branch support %f" %
                    (len(n2content[node]), node.support))
            log.log(28, "First suitable workflow: %s" % (node._target_wkname))

            # Finds best outgroup for the target node
            if npr_conf.use_outgroup:
                splitterconfname, _ = npr_conf.tree_splitter
                splitterconf = GLOBALS[threadid][splitterconfname]
                #seqs, outs = select_outgroups(node, n2content, splitterconf)
                #seqs, outs = select_closest_outgroup(node, n2content, splitterconf)
                seqs, outs = select_sister_outgroup(node, n2content, splitterconf)
            else:
                seqs = set([_i.name for _i in n2content[node]])
                outs = set()

            if seqs | outs == tasktree_content:
                log.log(26, "Discarding target node of size %s, due to identity with its parent node" %
                        len(n2content[node]))
                #print tasktree_content
                #print seqs
                #print outs
                trees_to_browse.append(node)
            else:
                npr_nodes += 1
                yield node, seqs, outs, node._target_wkname

    log.log(28, "%s nodes will be optimized", npr_nodes)

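# Minimal sketch of the is_leaf_fn trick that split_tree relies on: ete3
# traversal methods accept a predicate that makes internal nodes behave as
# leaves, so the tree is effectively "cut" at the first node matching a
# condition. The size threshold and helper names here are purely illustrative.
from ete3 import Tree

def make_size_cutter(min_size):
    def processable(node):
        # Treat any clade with fewer than min_size tips as an iteration unit
        return len(node) < min_size
    return processable

if __name__ == "__main__":
    t = Tree("((a,b),((c,d),(e,(f,g))));")
    for node in t.iter_leaves(is_leaf_fn=make_size_cutter(3)):
        print(node.get_leaf_names())
    # -> ['a', 'b'], ['c', 'd'], ['e'], ['f', 'g']
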
def get_concatenated_alg(alg_filenames, models=None,
                         sp_field=0, sp_delimiter="_",
                         kill_thr=0.0, keep_species=set()):
    # Concat alg container
    concat = SeqGroup()
    # Used to store different model partitions
    concat.id2partition = {}

    if not models:
        models = ["None"] * len(alg_filenames)
    else:
        if len(models) != len(alg_filenames):
            raise ValueError("Different number of algs and model names was found!")

    expected_total_length = 0
    # Check algs and get the whole set of species
    alg_objects = []
    sp2alg = defaultdict(list)

    for algfile, matrix in zip(alg_filenames, models):
        alg = SeqGroup(algfile, "fasta")
        alg_objects.append(alg)
        lenseq = None
        browsed_species = set()
        alg.sp2seq = {}
        # Set best matrix for this alignment
        alg.matrix = matrix
        # Change seq names to contain only species names
        for i, seq in six.iteritems(alg.id2seq):
            name = db.get_seq_name(alg.id2name[i])
            taxid = get_species_code(name, splitter=sp_delimiter, field=sp_field)
            if lenseq is not None and len(seq) != lenseq:
                raise Exception("Inconsistent alignment when concatenating: Unequal length")
            elif lenseq is None:
                lenseq = len(seq)
                alg.seqlength = len(seq)
                expected_total_length += len(seq)
            if taxid in browsed_species:
                raise Exception("Inconsistent alignment when concatenating: Repeated species")
            browsed_species.add(taxid)  # Check no duplicated species in the same alg
            sp2alg[taxid].append(alg)   # Records all species seen in all algs.
            alg.sp2seq[taxid] = seq

    valid_species = [sp for sp in six.iterkeys(sp2alg) \
                     if sp in keep_species or \
                     len(sp2alg[sp]) / float(len(alg_objects)) > kill_thr]

    log.info("%d out of %d will be kept (missing factor threshold=%g, %d species forced to be kept)" %
             (len(valid_species), len(sp2alg), kill_thr, len(keep_species)))

    def sort_single_algs(alg1, alg2):
        r = cmp(alg1.matrix, alg2.matrix)
        if r == 0:
            return cmp(sorted(alg1.id2name.values()),
                       sorted(alg2.id2name.values()))
        else:
            return r

    sorted_algs = sorted(alg_objects, sort_single_algs)
    concat_alg_lengths = [alg.seqlength for alg in sorted_algs]

    model2win = {}
    model2size = {}
    for alg in sorted_algs:
        model2size[alg.matrix] = model2size.get(alg.matrix, 0) + alg.seqlength

    # Create concat alg
    concat.id2seq = defaultdict(list)
    for sp in sorted(valid_species):
        log.log(20, "Concatenating sequences of [%s]" % sp)
        for alg in sorted_algs:
            seq = alg.sp2seq.get(sp, "-" * alg.seqlength)
            concat.id2seq[sp].append(seq)
            #current_seq = concat.id2seq.get(sp, "")
            #concat.id2seq[sp] = current_seq + seq.strip()
        concat.id2name[sp] = sp
        concat.name2id[sp] = sp
        concat.id2comment[sp] = [""]
        concat.id2seq[sp] = ''.join(concat.id2seq[sp])

    current_pos = 0
    partitions = []
    for model in sorted(model2size.keys()):
        size = model2size[model]
        part = "%s, %s = %d-%d" % (model, model + "_genes",
                                   current_pos + 1,
                                   current_pos + size)
        current_pos += size
        partitions.append(part)

    # Basic Checks
    seq_sizes = [len(seq) for seq in list(concat.id2seq.values())]
    if len(set(seq_sizes)) != 1:
        raise Exception("Concatenated alignment is not consistent: unequal seq length")
    if seq_sizes[0] != expected_total_length:
        raise Exception("The size of the concatenated alg is not what was expected")

    return concat, partitions, sp2alg, valid_species, concat_alg_lengths

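# Self-contained sketch of the concatenation logic above: per-gene alignments
# keyed by species are joined in a fixed order, species missing from a gene
# are padded with gaps, and RAxML-style partition lines are produced. All
# names here are illustrative; the real function works on SeqGroup objects,
# database sequence names and model matrices.
def concatenate(gene_algs):
    """gene_algs: list of (model, {species: aligned_seq}) tuples."""
    lengths = [len(next(iter(seqs.values()))) for _, seqs in gene_algs]
    species = sorted(set().union(*[seqs.keys() for _, seqs in gene_algs]))

    concat = {}
    for sp in species:
        concat[sp] = "".join(seqs.get(sp, "-" * L)
                             for (_, seqs), L in zip(gene_algs, lengths))

    partitions, pos = [], 0
    for (model, _), L in zip(gene_algs, lengths):
        partitions.append("%s, part%d = %d-%d" % (model, len(partitions) + 1,
                                                  pos + 1, pos + L))
        pos += L
    return concat, partitions

# Example:
#   concatenate([("JTT", {"spA": "MKL", "spB": "MQL"}),
#                ("WAG", {"spA": "FF--R", "spC": "FFTTR"})])
# pads spB/spC with gaps where a gene is missing and returns
# ['JTT, part1 = 1-3', 'WAG, part2 = 4-8'] as partitions.
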