def get_stored_data(fileid):
    try:
        _tid, _did = fileid.split(".")
        _did = int(_did)
    except (IndexError, ValueError):
        dataid = fileid
    else:
        dataid = db.get_dataid(_tid, _did)
    return db.get_data(dataid)
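# Illustrative usage sketch (not part of the pipeline; the ids below are made
# up): a composite "taskid.datatype" file id is resolved through the database
# index, while anything else is treated as a direct data id.
#
#   get_stored_data("f3a9c2.7")    # -> db.get_data(db.get_dataid("f3a9c2", 7))
#   get_stored_data("seq_fasta")   # -> db.get_data("seq_fasta")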
def finish(self):
    """Roots the new task tree and merges it into the main NPR tree."""
    def euc_dist(x, y):
        # normalized symmetric-difference distance between two leaf-name sets
        return len(x.symmetric_difference(y)) / float(len(x) + len(y))

    dataid = db.get_dataid(*self.task_tree_file.split("."))
    ttree = PhyloTree(db.get_data(dataid))
    mtree = self.main_tree
    ttree.dist = 0
    cladeid, target_seqs, out_seqs = db.get_node_info(self.threadid, self.nodeid)
    self.out_seqs = out_seqs
    self.target_seqs = target_seqs
    ttree_content = ttree.get_cached_content()

    if mtree and not out_seqs:
        mtree_content = mtree.get_cached_content()
        log.log(24, "Finding best scoring outgroup from previous iteration.")
        for _n in mtree_content:
            if _n.cladeid == cladeid:
                orig_target = _n

        target_left = set([_n.name for _n in mtree_content[orig_target.children[0]]])
        target_right = set([_n.name for _n in mtree_content[orig_target.children[1]]])

        partition_pairs = []
        everything = set([_n.name for _n in ttree_content[ttree]])
        for n, content in ttree_content.iteritems():
            if n is ttree:
                continue
            left = set([_n.name for _n in content])
            right = everything - left
            d1 = euc_dist(left, target_left)
            d2 = euc_dist(left, target_right)
            best_match = min(d1, d2)
            partition_pairs.append([best_match, left, right, n])
        partition_pairs.sort()

        self.outgroup_match_dist = partition_pairs[0][0]
        #self.outgroup_match = '#'.join( ['|'.join(partition_pairs[0][1]),
        #                                 '|'.join(partition_pairs[0][2])] )
        outgroup = partition_pairs[0][3]
        ttree.set_outgroup(outgroup)
        ttree.dist = orig_target.dist
        ttree.support = orig_target.support

        # Merge task and main trees
        parent = orig_target.up
        orig_target.detach()
        parent.add_child(ttree)

    elif mtree and out_seqs:
        log.log(26, "Rooting tree using %d custom seqs" % len(out_seqs))
        self.outgroup_match = '|'.join(out_seqs)
        #log.log(22, "Out seqs: %s", len(out_seqs))
        #log.log(22, "Target seqs: %s", target_seqs)
        if len(out_seqs) > 1:
            # first root to a single seq outside the outgroup
            # (should never fail and avoids random outgroup split
            # problems in unrooted trees)
            ttree.set_outgroup(ttree & list(target_seqs)[0])
            # Now try to get the outgroup node as a monophyletic clade
            outgroup = ttree.get_common_ancestor(out_seqs)
            if set(outgroup.get_leaf_names()) ^ out_seqs:
                msg = "Monophyly of the selected outgroup could not be guaranteed! The constraint tree probably failed."
                #dump_tree_debug(msg, self.taskdir, mtree, ttree, target_seqs, out_seqs)
                raise TaskError(self, msg)
        else:
            outgroup = ttree & list(out_seqs)[0]

        ttree.set_outgroup(outgroup)
        orig_target = self.main_tree.get_common_ancestor(target_seqs)
        found_target = outgroup.get_sisters()[0]

        ttree = ttree.get_common_ancestor(target_seqs)
        outgroup.detach()
        self.pre_iter_support = orig_target.support
        # Use previous dist and support
        ttree.dist = orig_target.dist
        ttree.support = orig_target.support
        parent = orig_target.up
        orig_target.detach()
        parent.add_child(ttree)

    else:
        # ROOTS FIRST ITERATION
        log.log(24, "Getting outgroup for first NPR split")

        # if an early split is provided in the command line, it
        # overrides the config file
        mainout = GLOBALS.get("first_split_outgroup", "midpoint")

        if mainout.lower() == "midpoint":
            log.log(26, "Rooting to midpoint.")
            best_outgroup = ttree.get_midpoint_outgroup()
            if best_outgroup:
                ttree.set_outgroup(best_outgroup)
            else:
                log.warning("Midpoint outgroup could not be set!")
                ttree.set_outgroup(ttree.iter_leaves().next())
        else:
            if mainout.startswith("~"):
                # Lazily defined outgroup: trust the common
                # ancestor of two or more OTUs
                strict_common_ancestor = False
                outs = set(mainout[1:].split())
                if len(outs) < 2:
                    raise TaskError(self, "First split outgroup error: common "
                                          "ancestor calculation requires at least two OTU names")
            else:
                strict_common_ancestor = True
                outs = set(mainout.split())

            if outs - target_seqs:
                raise TaskError(self, "Unknown seqs cannot be used to set first split rooting: %s" % (outs - target_seqs))

            if len(outs) > 1:
                anchor = list(set(target_seqs) - outs)[0]
                ttree.set_outgroup(ttree & anchor)
                common = ttree.get_common_ancestor(outs)
                out_seqs = common.get_leaf_names()
                if common is ttree:
                    msg = "First split outgroup could not be set: %s" % out_seqs
                    #dump_tree_debug(msg, self.taskdir, mtree, ttree, target_seqs, outs)
                    raise TaskError(self, msg)
                if strict_common_ancestor and set(out_seqs) ^ outs:
                    msg = "Monophyly of first split outgroup could not be guaranteed: %s" % out_seqs
                    #dump_tree_debug(msg, self.taskdir, mtree, ttree, target_seqs, outs)
                    raise TaskError(self, msg)
                log.log(26, "@@8:First split rooting to %d seqs@@1:: %s" % (len(out_seqs), out_seqs))
                ttree.set_outgroup(common)
            else:
                single_out = outs.pop()
                ttree.set_outgroup(single_out)
                log.log(26, "@@8:First split rooting to 1 seq@@1:: %s" % (single_out))

        self.main_tree = ttree
        orig_target = ttree

    tn = orig_target.copy()
    self.pre_iter_task_tree = tn
    self.rf = orig_target.robinson_foulds(ttree)
    self.pre_iter_support = orig_target.support

    # Reload node2content of the rooted tree and generate cladeids
    ttree_content = self.main_tree.get_cached_content()
    for n, content in ttree_content.iteritems():
        cid = generate_id([_n.name for _n in content])
        n.add_feature("cladeid", cid)

    #ttree.write(outfile=self.pruned_tree)
    self.task_tree = ttree
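# Worked example (illustrative only; leaf names are hypothetical): euc_dist()
# above is a normalized symmetric-difference distance between two sets of leaf
# names, so the candidate partition minimizing it against either side of the
# previous iteration's split is chosen as the new outgroup.
#
#   left        = {"A", "B", "C"}
#   target_left = {"A", "B", "D"}
#   # symmetric difference = {"C", "D"}  ->  distance = 2 / (3 + 3) = 0.33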
def finish(self):
    """Collects per-node alignments and builds the concatenated super-matrix."""
    # Assumes tasks resulting from a genetree workflow, in which
    # only Alg and Acleaner tasks could contain the results
    log.log(26, "Collecting supermatrix data")

    # jobtypes = set()
    # job2alg, job2acleaner = {}, {}
    # for job in self.jobs:
    #     jobtypes.add(job.ttype)
    #     if job.ttype == "alg" and job.nodeid not in self.job2alg:
    #         dataid = db.get_dataid(*job.alg_fasta_file.split("."))
    #         job2alg[job.nodeid] = db.get_data(dataid)
    #     elif job.ttype == "acleaner":
    #         a, b = job.clean_alg_fasta_file.split(".")
    #         dataid = db.get_dataid(*job.clean_alg_fasta_file.split("."))
    #         job2acleaner[job.nodeid] = db.get_data(dataid)
    #     elif job.ttype == "mchooser":
    #         self.job2model[job.nodeid] = job.best_model

    # Extract alignments from the tree jobs of the genetree workflow, so the
    # correct version is used in each case: raw, trimmed, or even the
    # nt-switched alignment
    observed_seqtypes = set()
    self.job2alg = {}
    for job in self.jobs:
        if job.ttype == "tree":
            observed_seqtypes.add(job.seqtype)
            taskid, datatype = job.alg_phylip_file.split(".")
            dataid = db.get_dataid(taskid, datatype)
            self.job2alg[job.nodeid] = db.get_data(dataid)
        elif job.ttype == "mchooser":
            self.job2model[job.nodeid] = job.best_model

    # if all alignments are nt, set it as the seqtype of the concat alg
    if len(observed_seqtypes) > 1:
        raise TaskError(self, "Mixed data types not supported in super-matrix workflow")
    elif "nt" in observed_seqtypes:
        self.seqtype = "nt"
        self.default_model = self.conf[self.confname]["_default_nt_model"]
    else:
        self.seqtype = "aa"
        self.default_model = self.conf[self.confname]["_default_aa_model"]

    if self.cog_ids - set(self.job2alg):
        missing = self.cog_ids - set(self.job2alg)
        log.error("Missing %s algs", len(missing))
        raise TaskError(self, "Missing algs (%d): i.e. %s" % (len(missing), sorted(missing)[:10]))

    alg_data = [(self.job2alg[nid], self.job2model.get(nid, self.default_model))
                for nid in self.job2alg]
    filenames, models = zip(*alg_data)

    mainalg, partitions, sp2alg, species, alg_lenghts = get_concatenated_alg(
        filenames, models, sp_field=0,
        sp_delimiter=GLOBALS["spname_delimiter"])

    log.log(20, "Done concat alg, now writing fasta format")
    fasta = mainalg.write(format="fasta")
    log.log(20, "Done concat alg, now writing phylip format")
    phylip = mainalg.write(format="iphylip_relaxed")
    txt_partitions = '\n'.join(partitions)
    log.log(26, "Modeled regions: \n" + '\n'.join(partitions))
    ConcatAlg.store_data(self, fasta, phylip, txt_partitions)
def process_task(task, wkname, npr_conf, nodeid2info):
    """Processes a finished task and returns the follow-up tasks to launch."""
    alignerconf, alignerclass = npr_conf.aligner
    cleanerconf, cleanerclass = npr_conf.alg_cleaner
    mtesterconf, mtesterclass = npr_conf.model_tester
    treebuilderconf, treebuilderclass = npr_conf.tree_builder
    if not treebuilderclass:
        # Allows dumping algs in workflows with no tree tasks
        treebuilderclass = DummyTree
    splitterconf, splitterclass = npr_conf.tree_splitter

    conf = GLOBALS[task.configid]
    seqtype = task.seqtype
    nodeid = task.nodeid
    ttype = task.ttype
    taskid = task.taskid
    threadid = task.threadid
    node_info = nodeid2info[nodeid]
    size = task.size  #node_info.get("size", 0)
    target_seqs = node_info.get("target_seqs", [])
    out_seqs = node_info.get("out_seqs", [])

    if not treebuilderclass or size < 4:
        # Allows dumping algs in workflows with no tree tasks, or if tree
        # inference does not make sense given the number of sequences. DummyTree
        # will produce a fake, fully collapsed newick tree.
        treebuilderclass = DummyTree
        mtesterclass = None

    # If more than one outgroup is used, enable the use of a constraint tree
    if out_seqs and len(out_seqs) > 1:
        constrain_id = nodeid
    else:
        constrain_id = None

    new_tasks = []
    if ttype == "msf":
        # Register tree constraints
        constrain_tree = "(%s, (%s));" % (','.join(sorted(task.out_seqs)),
                                          ','.join(sorted(task.target_seqs)))
        _outs = "\n".join(map(lambda name: ">%s\n0" % name, sorted(task.out_seqs)))
        _tars = "\n".join(map(lambda name: ">%s\n1" % name, sorted(task.target_seqs)))
        constrain_alg = '\n'.join([_outs, _tars])
        db.add_task_data(nodeid, DATATYPES.constrain_tree, constrain_tree)
        db.add_task_data(nodeid, DATATYPES.constrain_alg, constrain_alg)
        db.dataconn.commit()  # since the creation of some Task
                              # objects may require this info, I need
                              # to commit right now.
        # Register node
        db.add_node(task.threadid, task.nodeid, task.cladeid,
                    task.target_seqs, task.out_seqs)

        nodeid2info[nodeid]["size"] = task.size
        nodeid2info[nodeid]["target_seqs"] = task.target_seqs
        nodeid2info[nodeid]["out_seqs"] = task.out_seqs

        alg_task = alignerclass(nodeid, task.multiseq_file,
                                seqtype, conf, alignerconf)
        alg_task.size = task.size
        new_tasks.append(alg_task)

    elif ttype == "alg" or ttype == "acleaner":
        if ttype == "alg":
            nodeid2info[nodeid]["alg_path"] = task.alg_fasta_file
        elif ttype == "acleaner":
            nodeid2info[nodeid]["alg_clean_path"] = task.clean_alg_fasta_file

        alg_fasta_file = getattr(task, "clean_alg_fasta_file", task.alg_fasta_file)
        alg_phylip_file = getattr(task, "clean_alg_phylip_file", task.alg_phylip_file)

        # Calculate alignment stats
        # cons_mean, cons_std = get_trimal_conservation(task.alg_fasta_file,
        #                                               conf["app"]["trimal"])
        #
        # max_identity = get_trimal_identity(task.alg_fasta_file,
        #                                    conf["app"]["trimal"])
        # log.info("Conservation: %0.2f +-%0.2f", cons_mean, cons_std)
        # log.info("Max. Identity: %0.2f", max_identity)
        #import time
        #t1 = time.time()
        #mx, mn, mean, std = get_identity(task.alg_fasta_file)
        #print time.time()-t1
        #log.log(26, "Identity: max=%0.2f min=%0.2f mean=%0.2f +- %0.2f",
        #        mx, mn, mean, std)
        #t1 = time.time()

        if seqtype == "aa" and npr_conf.switch_aa_similarity < 1:
            try:
                alg_stats = db.get_task_data(taskid, DATATYPES.alg_stats)
            except Exception, e:
                alg_stats = {}

            if ttype == "alg":
                algfile = pjoin(GLOBALS["input_dir"], task.alg_phylip_file)
                dataid = DATATYPES.alg_phylip
            elif ttype == "acleaner":
                algfile = pjoin(GLOBALS["input_dir"], task.clean_alg_phylip_file)
                dataid = DATATYPES.clean_alg_phylip

            if "i_mean" not in alg_stats:
                log.log(24, "Calculating alignment stats...")
                # dump data if necessary
                algfile = pjoin(GLOBALS["input_dir"], task.alg_phylip_file)
                if not pexist(algfile):
                    # dump phylip alg
                    open(algfile, "w").write(db.get_data(db.get_dataid(taskid, dataid)))

                mx, mn, mean, std = get_statal_identity(algfile,
                                                        conf["app"]["statal"])
                alg_stats = {"i_max": mx, "i_mean": mean, "i_min": mn, "i_std": std}
                db.add_task_data(taskid, DATATYPES.alg_stats, alg_stats)

            log.log(22, "Alignment stats (sequence similarity):")
            log.log(22, "   max: %(i_max)0.2f, min:%(i_min)0.2f, avg:%(i_mean)0.2f+-%(i_std)0.2f" %
                    (alg_stats))
        else:
            alg_stats = {"i_max": -1, "i_mean": -1, "i_min": -1, "i_std": -1}

        #print time.time()-t1
        #log.log(24, "Identity: max=%0.2f min=%0.2f mean=%0.2f +- %0.2f",
        #        mx, mn, mean, std)

        task.max_ident = alg_stats["i_max"]
        task.min_ident = alg_stats["i_min"]
        task.mean_ident = alg_stats["i_mean"]
        task.std_ident = alg_stats["i_std"]

        next_task = None
        if ttype == "alg" and cleanerclass:
            next_task = cleanerclass(nodeid, seqtype, alg_fasta_file,
                                     alg_phylip_file, conf, cleanerconf)
        else:
            # Convert the aa alignment into nt if necessary
            if seqtype == "aa" and \
               "nt" in GLOBALS["seqtypes"] and \
               task.mean_ident >= npr_conf.switch_aa_similarity:
                log.log(28, "@@2:Switching to codon alignment!@@1: amino-acid sequence similarity: %0.2f >= %0.2f" %
                        (task.mean_ident, npr_conf.switch_aa_similarity))
                alg_fasta_file = "%s.%s" % (taskid, DATATYPES.alg_nt_fasta)
                alg_phylip_file = "%s.%s" % (taskid, DATATYPES.alg_nt_phylip)
                try:
                    alg_fasta_file = db.get_dataid(taskid, DATATYPES.alg_nt_fasta)
                    alg_phylip_file = db.get_dataid(taskid, DATATYPES.alg_nt_phylip)
                except ValueError:
                    log.log(22, "Calculating codon alignment...")
                    source_alg = pjoin(GLOBALS["input_dir"], task.alg_fasta_file)
                    if ttype == "alg":
                        kept_columns = []
                    elif ttype == "acleaner":
                        # if the original alignment was trimmed, use it as reference
                        # but build the nt alignment only on the kept columns
                        kept_columns = db.get_task_data(taskid, DATATYPES.kept_alg_columns)

                    if not pexist(source_alg):
                        open(source_alg, "w").write(db.get_task_data(taskid, DATATYPES.alg_fasta))

                    nt_alg = switch_to_codon(source_alg, kept_columns=kept_columns)
                    db.add_task_data(taskid, DATATYPES.alg_nt_fasta, nt_alg.write())
                    db.add_task_data(taskid, DATATYPES.alg_nt_phylip, nt_alg.write(format='iphylip_relaxed'))

                npr_conf = IterConfig(conf, wkname, task.size, "nt")
                seqtype = "nt"

            if mtesterclass:
                next_task = mtesterclass(nodeid, alg_fasta_file, alg_phylip_file,
                                         constrain_id, conf, mtesterconf)
            elif treebuilderclass:
                next_task = treebuilderclass(nodeid, alg_phylip_file,
                                             constrain_id, None, seqtype,
                                             conf, treebuilderconf)

        if next_task:
            next_task.size = task.size
            new_tasks.append(next_task)
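# Illustrative sketch (hypothetical sequence names): for an "msf" task with
# out_seqs = {"o1", "o2"} and target_seqs = {"t1", "t2", "t3"}, the constraint
# data registered above would look like:
#
#   constrain_tree = "(o1,o2, (t1,t2,t3));"
#   constrain_alg  = ">o1\n0\n>o2\n0\n>t1\n1\n>t2\n1\n>t3\n1"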
def schedule(workflow_task_processor, pending_tasks, schedule_time, execution,
             debug, norender):
    """Main scheduling loop: updates task status, queues waiting jobs and
    processes finished tasks."""
    # Adjust debug mode
    if debug == "all":
        log.setLevel(10)

    pending_tasks = set(pending_tasks)

    ## ===================================
    ## INITIALIZE BASIC VARS
    execution, run_detached = execution
    thread2tasks = defaultdict(list)
    for task in pending_tasks:
        thread2tasks[task.configid].append(task)
    expected_threads = set(thread2tasks.keys())
    past_threads = {}
    thread_errors = defaultdict(list)
    ## END OF VARS AND SHORTCUTS
    ## ===================================

    cores_total = GLOBALS["_max_cores"]
    if cores_total > 0:
        job_queue = Queue()
        back_launcher = Process(target=background_job_launcher,
                                args=(job_queue, run_detached,
                                      GLOBALS["launch_time"], cores_total))
        back_launcher.start()
    else:
        job_queue = None
        back_launcher = None

    GLOBALS["_background_scheduler"] = back_launcher
    GLOBALS["_job_queue"] = job_queue

    # Captures Ctrl-C for debugging
    #signal.signal(signal.SIGINT, control_c)

    last_report_time = None
    BUG = set()
    try:
        # Enter the task scheduling loop
        while pending_tasks:
            wtime = schedule_time

            # ask SGE for running jobs
            if execution == "sge":
                sgeid2jobs = db.get_sge_tasks()
                qstat_jobs = sge.qstat()
            else:
                qstat_jobs = None

            # Show summary of pending tasks per thread
            thread2tasks = defaultdict(list)
            for task in pending_tasks:
                thread2tasks[task.configid].append(task)
            set_logindent(0)
            log.log(28, "@@13: Updating tasks status:@@1: (%s)" % (ctime()))
            info_lines = []
            for tid, tlist in thread2tasks.iteritems():
                threadname = GLOBALS[tid]["_name"]
                sizelist = ["%s" % getattr(_ts, "size", "?") for _ts in tlist]
                info = "Thread @@13:%s@@1:: pending tasks: @@8:%s@@1: of sizes: %s" % (
                    threadname, len(tlist), ', '.join(sizelist))
                info_lines.append(info)

            for line in info_lines:
                log.log(28, line)

            if GLOBALS["email"] and last_report_time is None:
                last_report_time = time()
                send_mail(GLOBALS["email"], "Your NPR process has started", '\n'.join(info_lines))

            ## ================================
            ## CHECK AND UPDATE CURRENT TASKS
            checked_tasks = set()
            check_start_time = time()
            to_add_tasks = set()

            GLOBALS["cached_status"] = {}
            for task in sorted(pending_tasks, sort_tasks):
                # Avoids endless periods without new job submissions
                elapsed_time = time() - check_start_time
                #if not back_launcher and pending_tasks and \
                #        elapsed_time > schedule_time * 2:
                #    log.log(26, "@@8:Interrupting task checks to schedule new jobs@@1:")
                #    db.commit()
                #    wtime = launch_jobs(sorted(pending_tasks, sort_tasks),
                #                        execution, run_detached)
                #    check_start_time = time()

                # Enter debugging mode if necessary
                if debug and log.level > 10 and task.taskid.startswith(debug):
                    log.setLevel(10)
                    log.debug("ENTERING IN DEBUGGING MODE")
                thread2tasks[task.configid].append(task)

                # Update tasks and job statuses
                if task.taskid not in checked_tasks:
                    try:
                        show_task_info(task)
                        task.status = task.get_status(qstat_jobs)
                        db.dataconn.commit()
                        if back_launcher and task.status not in set("DE"):
                            for j, cmd in task.iter_waiting_jobs():
                                j.status = "Q"
                                GLOBALS["cached_status"][j.jobid] = "Q"
                                if j.jobid not in BUG:
                                    if not os.path.exists(j.jobdir):
                                        os.makedirs(j.jobdir)
                                    for ifile, outpath in j.input_files.iteritems():
                                        try:
                                            _tid, _did = ifile.split(".")
                                            _did = int(_did)
                                        except (IndexError, ValueError):
                                            dataid = ifile
                                        else:
                                            dataid = db.get_dataid(_tid, _did)

                                        if not outpath:
                                            outfile = pjoin(GLOBALS["input_dir"], ifile)
                                        else:
                                            outfile = pjoin(outpath, ifile)

                                        if not os.path.exists(outfile):
                                            open(outfile, "w").write(db.get_data(dataid))

                                    log.log(24, " @@8:Queueing @@1: %s from %s" % (j, task))
                                    job_queue.put([j.jobid, j.cores, cmd, j.status_file])
                                BUG.add(j.jobid)

                        update_task_states_recursively(task)
                        db.commit()
                        checked_tasks.add(task.taskid)
                    except TaskError, e:
                        log.error("Errors found in %s" % task)
                        import traceback
                        traceback.print_exc()
                        if GLOBALS["email"]:
                            threadname = GLOBALS[task.configid]["_name"]
                            send_mail(GLOBALS["email"], "Errors found in %s!" % threadname,
                                      '\n'.join(map(str, [task, e.value, e.msg])))
                        pending_tasks.discard(task)
                        thread_errors[task.configid].append([task, e.value, e.msg])
                        continue
                else:
                    # Set a temporary Queued state to avoid launching
                    # jobs from clones
                    task.status = "Q"
                    if log.level < 24:
                        show_task_info(task)

                if task.status == "D":
                    #db.commit()
                    show_task_info(task)
                    logindent(3)

                    # Log commands of every task
                    if 'cmd_log_file' not in GLOBALS[task.configid]:
                        GLOBALS[task.configid]['cmd_log_file'] = pjoin(GLOBALS[task.configid]["_outpath"], "cmd.log")
                        O = open(GLOBALS[task.configid]['cmd_log_file'], "w")
                        O.close()
                    cmd_lines = get_cmd_log(task)
                    CMD_LOG = open(GLOBALS[task.configid]['cmd_log_file'], "a")
                    print >>CMD_LOG, task
                    for c in cmd_lines:
                        print >>CMD_LOG, ' ' + '\t'.join(map(str, c))
                    CMD_LOG.close()
                    #

                    try:
                        #wkname = GLOBALS[task.configid]['_name']
                        create_tasks = workflow_task_processor(task, task.target_wkname)
                    except TaskError, e:
                        log.error("Errors found in %s" % task)
                        pending_tasks.discard(task)
                        thread_errors[task.configid].append([task, e.value, e.msg])
                        continue
                    else:
                        logindent(-3)
                        to_add_tasks.update(create_tasks)
                        pending_tasks.discard(task)

                elif task.status == "E":
                    log.error("Task contains errors: %s" % task)
                    log.error("Errors found in %s" % task)
                    pending_tasks.discard(task)
                    thread_errors[task.configid].append([task, None, "Found (E) task status"])
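# Note (descriptive of the job_queue.put() call above; values are only meant as
# an illustration): each entry handed to the background launcher is a plain list
#
#   [j.jobid, j.cores, cmd, j.status_file]
#
# i.e. the job id, the number of cores it requires, the shell command to run,
# and the path of the file where its status is tracked.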