def load_jobs(self):
    conf = self.conf
    for m in self.models:
        args = self.args.copy()
        args["--model"] = m
        bionj_job = Job(conf["app"]["phyml"], args,
                        parent_ids=[self.nodeid])
        bionj_job.jobname += "-bionj-" + m
        bionj_job.jobcat = "bionj"
        bionj_job.add_input_file(self.alg_phylip_file, bionj_job.jobdir)
        self.jobs.append(bionj_job)

        if self.lk_mode == "raxml":
            raxml_args = {
                "-f": "e",
                "-s": pjoin(bionj_job.jobdir, self.alg_phylip_file),
                "-m": "PROTGAMMA%s" % m,
                "-n": self.alg_phylip_file + "." + m,
                "-t": pjoin(bionj_job.jobdir,
                            self.alg_phylip_file + "_phyml_tree.txt")
            }
            raxml_job = Job(conf["app"]["raxml"], raxml_args,
                            parent_ids=[bionj_job.jobid])
            raxml_job.jobname += "-lk-optimize"
            raxml_job.dependencies.add(bionj_job)
            raxml_job.model = m
            raxml_job.jobcat = "raxml"
            self.jobs.append(raxml_job)
def finish(self):
    lks = []
    if self.lk_mode == "phyml":
        for job in self.jobs:
            if job.jobcat != "bionj":
                continue
            phyml_job = job
            tree_file = pjoin(phyml_job.jobdir,
                              self.alg_phylip_file + "_phyml_tree.txt")
            stats_file = pjoin(phyml_job.jobdir,
                               self.alg_phylip_file + "_phyml_stats.txt")
            tree = PhyloTree(tree_file)
            m = re.search(r'Log-likelihood:\s+(-?\d+\.\d+)',
                          open(stats_file).read())
            lk = float(m.groups()[0])
            tree.add_feature("lk", lk)
            tree.add_feature("model", phyml_job.args["--model"])
            lks.append([float(tree.lk), tree.model, tree])
    elif self.lk_mode == "raxml":
        for job in self.jobs:
            if job.jobcat != "raxml":
                continue
            raxml_job = job
            lk = open(pjoin(raxml_job.jobdir,
                            "RAxML_log.%s" % raxml_job.args["-n"])).readline().split()[1]
            tree = PhyloTree(raxml_job.args["-t"])
            tree.add_feature("lk", lk)
            tree.add_feature("model", raxml_job.model)
            lks.append([float(tree.lk), tree.model, tree])

    # Sort lk values in ascending order and pick the model with the highest
    # likelihood, i.e. the last one in the list.
    lks.sort()
    best_model = lks[-1][1]
    best_tree = lks[-1][2]
    log.log(22, "%s model selected from the following lk values:\n%s" %
            (best_model, '\n'.join(map(str, lks))))
    ModelTesterTask.store_data(self, best_model, lks)
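# A minimal, standalone sketch of the log-likelihood parsing used in finish()
# above. The stats text is a hypothetical excerpt of a PhyML stats file, only
# meant to show what the regular expression captures; it is not from a real run.
import re

example_stats = """
. Model of amino acids substitution:    JTT
. Log-likelihood:                       -10547.33189
"""

match = re.search(r'Log-likelihood:\s+(-?\d+\.\d+)', example_stats)
if match:
    example_lk = float(match.groups()[0])   # -10547.33189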
def app_wrapper(func, args):
    global NCURSES
    base_dir = GLOBALS.get("scratch_dir", GLOBALS["basedir"])
    lock_file = pjoin(base_dir, "alive")

    if not args.enable_ui:
        NCURSES = False

    if not pexist(lock_file) or args.clearall:
        open(lock_file, "w").write(time.ctime())
    else:
        clear_env()
        print >>sys.stderr, ('\nThe same process seems to be running. Use '
                             '--clearall or remove the lock file "alive" '
                             'within the output dir')
        sys.exit(-1)

    try:
        if NCURSES:
            curses.wrapper(main, func, args)
        else:
            main(None, func, args)
    except ConfigError, e:
        if GLOBALS.get('_background_scheduler', None):
            GLOBALS['_background_scheduler'].terminate()
        print >>sys.stderr, "\nConfiguration Error:", e
        clear_env()
        sys.exit(-1)
def load_jobs(self):
    appname = self.conf[self.confname]["_app"]
    args = self.args.copy()
    args["-in"] = pjoin(GLOBALS["input_dir"], self.alg_fasta_file)
    args["-out"] = "clean.alg.fasta"
    job = Job(self.conf["app"][appname], args, parent_ids=[self.nodeid])
    job.add_input_file(self.alg_fasta_file)
    self.jobs.append(job)
def load_jobs(self):
    # Only one alignment job is necessary to run this task
    appname = self.conf[self.confname]["_app"]
    args = OrderedDict(self.args)
    args[""] = "%s %s" % (pjoin(GLOBALS["input_dir"], self.multiseq_file),
                          "alg.fasta")
    job = Job(self.conf["app"][appname], args, parent_ids=[self.nodeid])
    job.add_input_file(self.multiseq_file)
    self.jobs.append(job)
def load_jobs(self):
    appname = self.conf[self.confname]["_app"]
    # Only one alignment job is necessary to run this task
    args = OrderedDict(self.args)
    args["-i"] = pjoin(GLOBALS["input_dir"], self.multiseq_file)
    args["-o"] = "alg.fasta"
    job = Job(self.conf["app"][appname], args, parent_ids=[self.nodeid])
    job.cores = self.conf["threading"].get(appname, 1)
    job.add_input_file(self.multiseq_file)
    self.jobs.append(job)
def load_jobs(self):
    args = self.args.copy()
    try:
        del args["-wag"]
    except KeyError:
        pass
    if self.constrain_tree:
        args["-constraints"] = pjoin(GLOBALS["input_dir"], self.constrain_tree)
    args[pjoin(GLOBALS["input_dir"], self.alg_phylip_file)] = ""
    appname = self.conf[self.confname]["_app"]

    job = Job(self.conf["app"][appname], args, parent_ids=[self.nodeid])
    job.cores = self.conf["threading"][appname]
    if self.constrain_tree:
        job.add_input_file(self.constrain_tree)
    job.add_input_file(self.alg_phylip_file)
    self.jobs.append(job)
def load_jobs(self):
    args = self.args.copy()
    args["-outfile"] = "mcoffee.fasta"
    alg_paths = [pjoin(GLOBALS["input_dir"], algid)
                 for algid in self.all_alg_files]
    args["-aln"] = ' '.join(alg_paths)
    job = Job(self.conf["app"]["tcoffee"], args, parent_ids=self.parent_ids)
    for key in self.all_alg_files:
        job.add_input_file(key)
    self.jobs.append(job)
def load_jobs(self):
    appname = self.conf[self.confname]["_app"]
    args = OrderedDict(self.args)
    # Mafft redirects the resulting alignment to standard output. The order
    # of arguments is important: the input file must be the last one.
    args[""] = pjoin(GLOBALS["input_dir"], self.multiseq_file)
    job = Job(self.conf["app"][appname], args, parent_ids=[self.nodeid])
    job.add_input_file(self.multiseq_file)
    job.cores = self.conf["threading"][appname]
    self.jobs.append(job)
def finish(self):
    # Once executed, the alignment is converted into relaxed
    # interleaved phylip format.
    alg = SeqGroup(os.path.join(self.jobs[0].jobdir, "mcoffee.fasta"))
    fasta = alg.write(format="fasta")
    phylip = alg.write(format="iphylip_relaxed")
    alg_list_string = '\n'.join([pjoin(GLOBALS["input_dir"], aname)
                                 for aname in self.all_alg_files])
    db.add_task_data(self.taskid, DATATYPES.alg_list, alg_list_string)
    AlgTask.store_data(self, fasta, phylip)
def clear_env():
    try:
        terminate_job_launcher()
    except:
        pass

    base_dir = GLOBALS["basedir"]
    lock_file = pjoin(base_dir, "alive")
    try:
        os.remove(lock_file)
    except Exception:
        print >>sys.stderr, "could not remove lock file %s" % lock_file
    clear_tempdir()
def finish(self):
    if self.conf[self.confname]["_alg_trimming"]:
        # If trimming happened after mcoffee, let's save the
        # resulting output
        trim_job = self.jobs[-1]
        alg = SeqGroup(pjoin(trim_job.jobdir, trim_job.alg_fasta_file))
        fasta = alg.write(format="fasta")
        phylip = alg.write(format="iphylip_relaxed")
        AlgTask.store_data(self, fasta, phylip)
    else:
        # If no post trimming, output is just what Mcoffee
        # produced, so we can recycle its data ids.
        mc_task = self.jobs[-1]
        fasta_id = db.get_dataid(mc_task.taskid, DATATYPES.alg_fasta)
        phylip_id = db.get_dataid(mc_task.taskid, DATATYPES.alg_phylip)
        db.register_task_data(self.taskid, DATATYPES.alg_fasta, fasta_id)
        db.register_task_data(self.taskid, DATATYPES.alg_phylip, phylip_id)
def dump_tree_debug(msg, taskdir, mtree, ttree, target_seqs, out_seqs):
    try:
        if out_seqs is None:
            out_seqs = set()
        if target_seqs is None:
            target_seqs = set()
        if ttree:
            for n in ttree.get_leaves():
                if n.name in out_seqs:
                    n.name = n.name + " *__OUTGROUP__*"
        if mtree:
            for n in mtree.get_leaves():
                if n.name in out_seqs:
                    n.name = n.name + " *__OUTGROUP__*"
                if n.name in target_seqs:
                    n.name = n.name + " [ TARGET ]"
        OUT = open(pjoin(taskdir, "__debug__"), "w")
        print >>OUT, msg
        print >>OUT, "MainTree:", mtree
        print >>OUT, "TaskTree:", ttree
        print >>OUT, "Expected outgroups:", out_seqs
        OUT.close()
    except Exception, e:
        print e
def __init__(self, bin, args, jobname=None, parent_ids=None):
    # Used at execution time
    self.status = None
    # How to run the app
    self.bin = bin
    # command line arguments
    self.args = args
    # Default number of cores used by the job. If more than 1,
    # this attribute should be changed
    self.cores = 1
    self.exec_type = "insitu"
    self.jobname = jobname

    # Generates the unique job identifier based on the params of the app.
    # Some params include path names that could prevent recycling the job,
    # so we clean them here.
    clean = lambda x: basename(x) if GLOBALS["basedir"] in x or GLOBALS["taskdir"] in x else x
    parsed_id_string = ["%s %s" % (clean(str(pair[0])), clean(str(pair[1])))
                        for pair in self.args.iteritems()]
    #print '\n'.join(map(str, self.args.items()))
    self.jobid = md5(','.join(sorted([md5(e) for e in parsed_id_string])))
    # self.jobid = md5(','.join(sorted([md5(str(pair)) for pair in
    #                                   self.args.iteritems()])))
    if parent_ids:
        self.jobid = md5(','.join(sorted(parent_ids + [self.jobid])))
    if not self.jobname:
        self.jobname = re.sub("[^0-9a-zA-Z]", "-", basename(self.bin))
    self.ifdone_cmd = ""
    self.iffail_cmd = ""
    self.set_jobdir(pjoin(GLOBALS["tasks_dir"], self.jobid))
    self.input_files = {}
    self.dependencies = set()
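# A minimal sketch of the job-identity idea used in __init__, assuming md5()
# is a helper that returns the hex digest of a string (as hashlib provides).
# Jobs built from the same arguments hash to the same id regardless of
# argument order, which is what allows finished jobs to be recycled.
import hashlib

def md5_hex(text):
    return hashlib.md5(text.encode("utf-8")).hexdigest()

def job_id(args):
    # hash each "key value" pair, then hash the sorted list of hashes
    parts = sorted(md5_hex("%s %s" % (k, v)) for k, v in args.items())
    return md5_hex(",".join(parts))

args_a = {"-s": "alg.phylip", "-m": "PROTGAMMAJTT"}
args_b = {"-m": "PROTGAMMAJTT", "-s": "alg.phylip"}   # same params, other order
assert job_id(args_a) == job_id(args_b)   # order-independent, stable identifier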
def finish(self):
    # Once executed, the alignment is converted into relaxed interleaved
    # phylip format. Both files, fasta and phylip, remain accessible.

    # Set task-specific attributes
    main_job = self.jobs[0]
    fasta_path = pjoin(main_job.jobdir, "clean.alg.fasta")
    alg = SeqGroup(fasta_path)

    if len(alg) != self.size:
        log.warning("Trimming was too aggressive and tried to remove"
                    " one or more sequences."
                    " Alignment trimming will be disabled for this dataset.")
        self.clean_alg_fasta_file = db.register_task_data(
            self.taskid, DATATYPES.clean_alg_fasta, self.alg_fasta_file)
        self.clean_alg_phylip_file = db.register_task_data(
            self.taskid, DATATYPES.clean_alg_phylip, self.alg_phylip_file)
    else:
        for line in open(self.jobs[0].stdout_file):
            line = line.strip()
            if line.startswith("#ColumnsMap"):
                kept_columns = map(int, line.split("\t")[1].split(","))
        fasta = alg.write(format="fasta")
        phylip = alg.write(format="iphylip_relaxed")
        AlgCleanerTask.store_data(self, fasta, phylip, kept_columns)
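# A small self-contained sketch of the "#ColumnsMap" parsing above. The sample
# line is hypothetical; it mimics the column-numbering report that trimal can
# print, listing which original alignment columns survived trimming.
sample_line = "#ColumnsMap\t0, 1, 2, 5, 6, 9, 10"

if sample_line.startswith("#ColumnsMap"):
    kept_columns = map(int, sample_line.split("\t")[1].split(","))
    # -> [0, 1, 2, 5, 6, 9, 10]: indices of the surviving columns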
def load_jobs(self):
    readal_bin = self.conf["app"]["readal"]
    trimal_bin = self.conf["app"]["trimal"]
    input_dir = GLOBALS["input_dir"]

    multiseq_file = pjoin(input_dir, self.multiseq_file)
    multiseq_file_r = pjoin(input_dir, self.multiseq_file + "_reversed")

    first = seq_reverser_job(multiseq_file, multiseq_file_r,
                             [self.nodeid], readal_bin)
    #print self.multiseq_file
    first.add_input_file(self.multiseq_file)
    self.jobs.append(first)

    all_alg_names = []
    mcoffee_parents = []
    for aligner_name in self.conf[self.confname]["_aligners"]:
        aligner_name = aligner_name[1:]
        _classname = APP2CLASS[self.conf[aligner_name]["_app"]]
        _module = __import__(CLASS2MODULE[_classname], globals(), locals(), [], -1)
        _aligner = getattr(_module, _classname)

        # Normal alg
        task1 = _aligner(self.nodeid, self.multiseq_file,
                         self.seqtype, self.conf, aligner_name)
        task1.size = self.size
        self.jobs.append(task1)
        all_alg_names.append(task1.alg_fasta_file)

        # Alg of the reverse
        task2 = _aligner(self.nodeid, self.multiseq_file + "_reversed",
                         self.seqtype, self.conf, aligner_name)
        task2.size = self.size
        task2.dependencies.add(first)
        self.jobs.append(task2)

        # Restore reverse alg
        reverse_out = pjoin(input_dir, task2.alg_fasta_file)
        task3 = seq_reverser_job(reverse_out, reverse_out + "_restored",
                                 [task2.taskid], readal_bin)
        task3.dependencies.add(task2)
        task3.add_input_file(task2.alg_fasta_file)
        all_alg_names.append(reverse_out + "_restored")
        self.jobs.append(task3)

        mcoffee_parents.extend([task1.taskid, task2.taskid])

    # Combine the signal from all algs using Mcoffee
    mcoffee_task = MCoffee(self.nodeid, self.seqtype, all_alg_names,
                           self.conf, self.confname,
                           parent_ids=mcoffee_parents)
    # Reversed algs are not actually saved into the db, but they should be
    # present since the reverser job is always executed.
    mcoffee_task.dependencies.update(list(self.jobs))
    self.jobs.append(mcoffee_task)

    if self.conf[self.confname]["_alg_trimming"]:
        trimming_cutoff = 1.0 / len(all_alg_names)
        targs = {}
        targs["-forceselect"] = pjoin(input_dir, mcoffee_task.alg_fasta_file)
        targs["-compareset"] = pjoin(input_dir, mcoffee_task.alg_list_file)
        targs["-out"] = "mcoffee.trimmed.fasta"
        targs["-fasta"] = ""
        targs["-ct"] = trimming_cutoff
        trim_job = Job(trimal_bin, targs, parent_ids=[mcoffee_task.taskid])
        trim_job.jobname = "McoffeeTrimming"
        trim_job.dependencies.add(mcoffee_task)
        trim_job.alg_fasta_file = targs["-out"]
        for key in all_alg_names:
            trim_job.add_input_file(key)
        trim_job.add_input_file(mcoffee_task.alg_fasta_file)
        trim_job.add_input_file(mcoffee_task.alg_list_file)
        self.jobs.append(trim_job)
def load_jobs(self):
    args = OrderedDict(self.args)
    args["-s"] = pjoin(GLOBALS["input_dir"], self.alg_phylip_file)
    args["-m"] = self.model_string
    args["-n"] = self.alg_phylip_file
    if self.constrain_tree:
        log.log(24, "Using constrain tree %s" % self.constrain_tree)
        args["-g"] = pjoin(GLOBALS["input_dir"], self.constrain_tree)
    if self.partitions_file:
        log.log(24, "Using alg partitions %s" % self.partitions_file)
        args['-q'] = pjoin(GLOBALS["input_dir"], self.partitions_file)

    tree_job = Job(self.raxml_bin, args, parent_ids=[self.nodeid])
    tree_job.jobname += "-" + self.model_string
    tree_job.cores = self.threads
    # Register input files necessary to run the job
    tree_job.add_input_file(self.alg_phylip_file)
    if self.constrain_tree:
        tree_job.add_input_file(self.constrain_tree)
    if self.partitions_file:
        tree_job.add_input_file(self.partitions_file)
    self.jobs.append(tree_job)

    self.out_tree_file = os.path.join(tree_job.jobdir,
                                      "RAxML_bestTree." + self.alg_phylip_file)

    if self.bootstrap == "alrt":
        alrt_args = tree_job.args.copy()
        if self.constrain_tree:
            del alrt_args["-g"]
        if self.partitions_file:
            alrt_args["-q"] = args['-q']
        alrt_args["-f"] = "J"
        alrt_args["-t"] = self.out_tree_file
        alrt_job = Job(self.raxml_bin, alrt_args,
                       parent_ids=[tree_job.jobid])
        alrt_job.jobname += "-alrt"
        alrt_job.dependencies.add(tree_job)
        alrt_job.cores = self.threads

        # Register necessary input files
        alrt_job.add_input_file(self.alg_phylip_file)
        if self.partitions_file:
            alrt_job.add_input_file(self.partitions_file)

        self.jobs.append(alrt_job)
        self.alrt_job = alrt_job

    elif self.bootstrap == "alrt_phyml":
        alrt_args = {
            "-o": "n",
            "-i": self.alg_phylip_file,
            "--bootstrap": "-2",
            "-d": self.seqtype,
            "-u": self.out_tree_file,
            "--model": self.model,
            "--quiet": "",
            "--no_memory_check": "",
        }
        #if self.constrain_tree:
        #    alrt_args["--constraint_tree"] = self.constrain_tree

        alrt_job = Job(self.conf["app"]["phyml"], alrt_args,
                       parent_ids=[tree_job.jobid])
        alrt_job.add_input_file(self.alg_phylip_file, alrt_job.jobdir)
        alrt_job.jobname += "-alrt"
        alrt_job.dependencies.add(tree_job)
        alrt_job.add_input_file(self.alg_phylip_file)
        self.jobs.append(alrt_job)
        self.alrt_job = alrt_job

    else:
        # Bootstrap calculation
        boot_args = tree_job.args.copy()
        boot_args["-n"] = "bootstraps." + boot_args["-n"]
        boot_args["-N"] = int(self.bootstrap)
        boot_args["-b"] = 31416
        boot_job = Job(self.raxml_bin, boot_args,
                       parent_ids=[tree_job.jobid])
        boot_job.jobname += "-%d-bootstraps" % (boot_args['-N'])
        boot_job.dependencies.add(tree_job)
        boot_job.cores = self.threads

        # Register necessary input files
        boot_job.add_input_file(self.alg_phylip_file)
        if self.constrain_tree:
            boot_job.add_input_file(self.constrain_tree)
        if self.partitions_file:
            boot_job.add_input_file(self.partitions_file)

        self.jobs.append(boot_job)

        # Bootstrap drawing on top of best tree
        bootd_args = tree_job.args.copy()
        if self.constrain_tree:
            del bootd_args["-g"]
        if self.partitions_file:
            del bootd_args["-q"]
        bootd_args["-n"] = "bootstrapped." + tree_job.args["-n"]
        bootd_args["-f"] = "b"
        bootd_args["-t"] = self.out_tree_file
        bootd_args["-z"] = pjoin(boot_job.jobdir,
                                 "RAxML_bootstrap." + boot_job.args["-n"])
        bootd_job = Job(self.raxml_bin, bootd_args,
                        parent_ids=[tree_job.jobid])
        bootd_job.jobname += "-bootstrapped"
        bootd_job.dependencies.add(boot_job)
        bootd_job.cores = self.threads
        self.jobs.append(bootd_job)

        self.boot_job = boot_job
        self.bootd_job = bootd_job
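# A short sketch of the RAxML output naming convention relied on above: for a
# run named with "-n <name>", RAxML writes its results as "RAxML_<kind>.<name>"
# inside the working directory. The paths below are illustrative only.
run_name = "alg.phylip"          # what load_jobs passes as args["-n"]
jobdir = "/tmp/raxml_job"        # hypothetical job directory

best_tree = "%s/RAxML_bestTree.%s" % (jobdir, run_name)
bootstraps = "%s/RAxML_bootstrap.bootstraps.%s" % (jobdir, run_name)
# best_tree  -> /tmp/raxml_job/RAxML_bestTree.alg.phylip
# bootstraps -> /tmp/raxml_job/RAxML_bootstrap.bootstraps.alg.phylip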
e_obj = error[1] if error[1] else error[0]
error_path = e_obj.jobdir if isjob(e_obj) else e_obj.taskid
if e_obj is not error[0]:
    error_lines.append("  -> %s" % e_obj)
error_lines.append("  -> %s" % error_path)
error_lines.append("  -> %s" % error[2])

for eline in error_lines:
    log.error(eline)

pending_threads = set([ts.configid for ts in pending_tasks])
finished_threads = expected_threads - (pending_threads | set(thread_errors.keys()))
just_finished_lines = []
finished_lines = []
for configid in finished_threads:
    # configid is the same as threadid in master tasks
    final_tree_file = pjoin(GLOBALS[configid]["_outpath"],
                            GLOBALS["inputname"] + ".final_tree")
    threadname = GLOBALS[configid]["_name"]
    if configid in past_threads:
        log.log(28, "Done thread @@12:%s@@1: in %d iteration(s)",
                threadname, past_threads[configid])
        finished_lines.append("Finished %s in %d iteration(s)" % (
            threadname, past_threads[configid]))
    else:
        log.log(28, "Assembling final tree...")
        main_tree, treeiters = assembly_tree(configid)
        past_threads[configid] = treeiters - 1
        log.log(28, "Done thread @@12:%s@@1: in %d iteration(s)",
                threadname, past_threads[configid])
def process_task(task, wkname, npr_conf, nodeid2info):
    alignerconf, alignerclass = npr_conf.aligner
    cleanerconf, cleanerclass = npr_conf.alg_cleaner
    mtesterconf, mtesterclass = npr_conf.model_tester
    treebuilderconf, treebuilderclass = npr_conf.tree_builder
    if not treebuilderclass:
        # Allows dumping algs in workflows with no tree tasks
        treebuilderclass = DummyTree
    splitterconf, splitterclass = npr_conf.tree_splitter

    conf = GLOBALS[task.configid]
    seqtype = task.seqtype
    nodeid = task.nodeid
    ttype = task.ttype
    taskid = task.taskid
    threadid = task.threadid
    node_info = nodeid2info[nodeid]
    size = task.size  #node_info.get("size", 0)
    target_seqs = node_info.get("target_seqs", [])
    out_seqs = node_info.get("out_seqs", [])

    if not treebuilderclass or size < 4:
        # Allows dumping algs in workflows with no tree tasks, or when tree
        # inference does not make sense given the number of sequences.
        # DummyTree will produce a fake, fully collapsed newick tree.
        treebuilderclass = DummyTree
        mtesterclass = None

    # If more than one outgroup is used, enable the use of a constraint tree
    if out_seqs and len(out_seqs) > 1:
        constrain_id = nodeid
    else:
        constrain_id = None

    new_tasks = []
    if ttype == "msf":
        # Register tree constraints
        constrain_tree = "(%s, (%s));" % (','.join(sorted(task.out_seqs)),
                                          ','.join(sorted(task.target_seqs)))
        _outs = "\n".join(map(lambda name: ">%s\n0" % name, sorted(task.out_seqs)))
        _tars = "\n".join(map(lambda name: ">%s\n1" % name, sorted(task.target_seqs)))
        constrain_alg = '\n'.join([_outs, _tars])
        db.add_task_data(nodeid, DATATYPES.constrain_tree, constrain_tree)
        db.add_task_data(nodeid, DATATYPES.constrain_alg, constrain_alg)
        db.dataconn.commit()  # since the creation of some Task objects may
                              # require this info, we need to commit right now

        # Register node
        db.add_node(task.threadid, task.nodeid, task.cladeid,
                    task.target_seqs, task.out_seqs)

        nodeid2info[nodeid]["size"] = task.size
        nodeid2info[nodeid]["target_seqs"] = task.target_seqs
        nodeid2info[nodeid]["out_seqs"] = task.out_seqs
        alg_task = alignerclass(nodeid, task.multiseq_file,
                                seqtype, conf, alignerconf)
        alg_task.size = task.size
        new_tasks.append(alg_task)

    elif ttype == "alg" or ttype == "acleaner":
        if ttype == "alg":
            nodeid2info[nodeid]["alg_path"] = task.alg_fasta_file
        elif ttype == "acleaner":
            nodeid2info[nodeid]["alg_clean_path"] = task.clean_alg_fasta_file

        alg_fasta_file = getattr(task, "clean_alg_fasta_file", task.alg_fasta_file)
        alg_phylip_file = getattr(task, "clean_alg_phylip_file", task.alg_phylip_file)

        # Calculate alignment stats
        # cons_mean, cons_std = get_trimal_conservation(task.alg_fasta_file,
        #                                               conf["app"]["trimal"])
        #
        # max_identity = get_trimal_identity(task.alg_fasta_file,
        #                                    conf["app"]["trimal"])
        # log.info("Conservation: %0.2f +-%0.2f", cons_mean, cons_std)
        # log.info("Max. Identity: %0.2f", max_identity)
        #import time
        #t1 = time.time()
        #mx, mn, mean, std = get_identity(task.alg_fasta_file)
        #print time.time()-t1
        #log.log(26, "Identity: max=%0.2f min=%0.2f mean=%0.2f +- %0.2f",
        #        mx, mn, mean, std)
        #t1 = time.time()

        if seqtype == "aa" and npr_conf.switch_aa_similarity < 1:
            try:
                alg_stats = db.get_task_data(taskid, DATATYPES.alg_stats)
            except Exception, e:
                alg_stats = {}

            if ttype == "alg":
                algfile = pjoin(GLOBALS["input_dir"], task.alg_phylip_file)
                dataid = DATATYPES.alg_phylip
            elif ttype == "acleaner":
                algfile = pjoin(GLOBALS["input_dir"], task.clean_alg_phylip_file)
                dataid = DATATYPES.clean_alg_phylip

            if "i_mean" not in alg_stats:
                log.log(24, "Calculating alignment stats...")
                # dump data if necessary
                algfile = pjoin(GLOBALS["input_dir"], task.alg_phylip_file)
                if not pexist(algfile):
                    # dump phylip alg
                    open(algfile, "w").write(db.get_data(db.get_dataid(taskid, dataid)))
                mx, mn, mean, std = get_statal_identity(algfile,
                                                        conf["app"]["statal"])
                alg_stats = {"i_max": mx, "i_mean": mean, "i_min": mn, "i_std": std}
                db.add_task_data(taskid, DATATYPES.alg_stats, alg_stats)

            log.log(22, "Alignment stats (sequence similarity):")
            log.log(22, "   max: %(i_max)0.2f, min:%(i_min)0.2f, avg:%(i_mean)0.2f+-%(i_std)0.2f" %
                    (alg_stats))
        else:
            alg_stats = {"i_max": -1, "i_mean": -1, "i_min": -1, "i_std": -1}

        #print time.time()-t1
        #log.log(24, "Identity: max=%0.2f min=%0.2f mean=%0.2f +- %0.2f",
        #        mx, mn, mean, std)
        task.max_ident = alg_stats["i_max"]
        task.min_ident = alg_stats["i_min"]
        task.mean_ident = alg_stats["i_mean"]
        task.std_ident = alg_stats["i_std"]

        next_task = None
        if ttype == "alg" and cleanerclass:
            next_task = cleanerclass(nodeid, seqtype, alg_fasta_file,
                                     alg_phylip_file, conf, cleanerconf)
        else:
            # Converts aa alignment into nt if necessary
            if seqtype == "aa" and \
               "nt" in GLOBALS["seqtypes"] and \
               task.mean_ident >= npr_conf.switch_aa_similarity:
                log.log(28, "@@2:Switching to codon alignment!@@1: amino-acid sequence similarity: %0.2f >= %0.2f" %
                        (task.mean_ident, npr_conf.switch_aa_similarity))
                alg_fasta_file = "%s.%s" % (taskid, DATATYPES.alg_nt_fasta)
                alg_phylip_file = "%s.%s" % (taskid, DATATYPES.alg_nt_phylip)
                try:
                    alg_fasta_file = db.get_dataid(taskid, DATATYPES.alg_nt_fasta)
                    alg_phylip_file = db.get_dataid(taskid, DATATYPES.alg_nt_phylip)
                except ValueError:
                    log.log(22, "Calculating codon alignment...")
                    source_alg = pjoin(GLOBALS["input_dir"], task.alg_fasta_file)
                    if ttype == "alg":
                        kept_columns = []
                    elif ttype == "acleaner":
                        # if the original alignment was trimmed, use it as the
                        # reference, but build the nt alignment only on the
                        # kept columns
                        kept_columns = db.get_task_data(taskid, DATATYPES.kept_alg_columns)
                    if not pexist(source_alg):
                        open(source_alg, "w").write(db.get_task_data(taskid, DATATYPES.alg_fasta))
                    nt_alg = switch_to_codon(source_alg, kept_columns=kept_columns)
                    db.add_task_data(taskid, DATATYPES.alg_nt_fasta, nt_alg.write())
                    db.add_task_data(taskid, DATATYPES.alg_nt_phylip,
                                     nt_alg.write(format='iphylip_relaxed'))
                npr_conf = IterConfig(conf, wkname, task.size, "nt")
                seqtype = "nt"

            if mtesterclass:
                next_task = mtesterclass(nodeid, alg_fasta_file, alg_phylip_file,
                                         constrain_id, conf, mtesterconf)
            elif treebuilderclass:
                next_task = treebuilderclass(nodeid, alg_phylip_file,
                                             constrain_id, None, seqtype,
                                             conf, treebuilderconf)

        if next_task:
            next_task.size = task.size
            new_tasks.append(next_task)
def schedule(workflow_task_processor, pending_tasks, schedule_time, execution,
             debug, norender):
    # Adjust debug mode
    if debug == "all":
        log.setLevel(10)

    pending_tasks = set(pending_tasks)

    ## ===================================
    ## INITIALIZE BASIC VARS
    execution, run_detached = execution
    thread2tasks = defaultdict(list)
    for task in pending_tasks:
        thread2tasks[task.configid].append(task)
    expected_threads = set(thread2tasks.keys())
    past_threads = {}
    thread_errors = defaultdict(list)
    ## END OF VARS AND SHORTCUTS
    ## ===================================

    cores_total = GLOBALS["_max_cores"]
    if cores_total > 0:
        job_queue = Queue()
        back_launcher = Process(target=background_job_launcher,
                                args=(job_queue, run_detached,
                                      GLOBALS["launch_time"], cores_total))
        back_launcher.start()
    else:
        job_queue = None
        back_launcher = None

    GLOBALS["_background_scheduler"] = back_launcher
    GLOBALS["_job_queue"] = job_queue

    # Captures Ctrl-C for debugging
    #signal.signal(signal.SIGINT, control_c)

    last_report_time = None
    BUG = set()
    try:
        # Enters into task scheduling
        while pending_tasks:
            wtime = schedule_time

            # ask SGE for running jobs
            if execution == "sge":
                sgeid2jobs = db.get_sge_tasks()
                qstat_jobs = sge.qstat()
            else:
                qstat_jobs = None

            # Show summary of pending tasks per thread
            thread2tasks = defaultdict(list)
            for task in pending_tasks:
                thread2tasks[task.configid].append(task)
            set_logindent(0)
            log.log(28, "@@13: Updating tasks status:@@1: (%s)" % (ctime()))
            info_lines = []
            for tid, tlist in thread2tasks.iteritems():
                threadname = GLOBALS[tid]["_name"]
                sizelist = ["%s" % getattr(_ts, "size", "?") for _ts in tlist]
                info = "Thread @@13:%s@@1:: pending tasks: @@8:%s@@1: of sizes: %s" % (
                    threadname, len(tlist), ', '.join(sizelist))
                info_lines.append(info)

            for line in info_lines:
                log.log(28, line)

            if GLOBALS["email"] and last_report_time is None:
                last_report_time = time()
                send_mail(GLOBALS["email"], "Your NPR process has started",
                          '\n'.join(info_lines))

            ## ================================
            ## CHECK AND UPDATE CURRENT TASKS
            checked_tasks = set()
            check_start_time = time()
            to_add_tasks = set()

            GLOBALS["cached_status"] = {}
            for task in sorted(pending_tasks, sort_tasks):
                # Avoids endless periods without new job submissions
                elapsed_time = time() - check_start_time
                #if not back_launcher and pending_tasks and \
                #        elapsed_time > schedule_time * 2:
                #    log.log(26, "@@8:Interrupting task checks to schedule new jobs@@1:")
                #    db.commit()
                #    wtime = launch_jobs(sorted(pending_tasks, sort_tasks),
                #                        execution, run_detached)
                #    check_start_time = time()

                # Enter debugging mode if necessary
                if debug and log.level > 10 and task.taskid.startswith(debug):
                    log.setLevel(10)
                    log.debug("ENTERING IN DEBUGGING MODE")
                thread2tasks[task.configid].append(task)

                # Update tasks and job statuses
                if task.taskid not in checked_tasks:
                    try:
                        show_task_info(task)
                        task.status = task.get_status(qstat_jobs)
                        db.dataconn.commit()
                        if back_launcher and task.status not in set("DE"):
                            for j, cmd in task.iter_waiting_jobs():
                                j.status = "Q"
                                GLOBALS["cached_status"][j.jobid] = "Q"
                                if j.jobid not in BUG:
                                    if not os.path.exists(j.jobdir):
                                        os.makedirs(j.jobdir)
                                    for ifile, outpath in j.input_files.iteritems():
                                        try:
                                            _tid, _did = ifile.split(".")
                                            _did = int(_did)
                                        except (IndexError, ValueError):
                                            dataid = ifile
                                        else:
                                            dataid = db.get_dataid(_tid, _did)

                                        if not outpath:
                                            outfile = pjoin(GLOBALS["input_dir"], ifile)
                                        else:
                                            outfile = pjoin(outpath, ifile)

                                        if not os.path.exists(outfile):
                                            open(outfile, "w").write(db.get_data(dataid))

                                    log.log(24, " @@8:Queueing @@1: %s from %s" % (j, task))
                                    job_queue.put([j.jobid, j.cores, cmd, j.status_file])
                                BUG.add(j.jobid)

                        update_task_states_recursively(task)
                        db.commit()
                        checked_tasks.add(task.taskid)
                    except TaskError, e:
                        log.error("Errors found in %s" % task)
                        import traceback
                        traceback.print_exc()
                        if GLOBALS["email"]:
                            threadname = GLOBALS[task.configid]["_name"]
                            send_mail(GLOBALS["email"], "Errors found in %s!" % threadname,
                                      '\n'.join(map(str, [task, e.value, e.msg])))
                        pending_tasks.discard(task)
                        thread_errors[task.configid].append([task, e.value, e.msg])
                        continue
                else:
                    # Set a temporary Queued state to avoid launching
                    # jobs from clones
                    task.status = "Q"
                    if log.level < 24:
                        show_task_info(task)

                if task.status == "D":
                    #db.commit()
                    show_task_info(task)
                    logindent(3)

                    # Log commands of every task
                    if 'cmd_log_file' not in GLOBALS[task.configid]:
                        GLOBALS[task.configid]['cmd_log_file'] = pjoin(
                            GLOBALS[task.configid]["_outpath"], "cmd.log")
                        O = open(GLOBALS[task.configid]['cmd_log_file'], "w")
                        O.close()
                    cmd_lines = get_cmd_log(task)
                    CMD_LOG = open(GLOBALS[task.configid]['cmd_log_file'], "a")
                    print >>CMD_LOG, task
                    for c in cmd_lines:
                        print >>CMD_LOG, '   ' + '\t'.join(map(str, c))
                    CMD_LOG.close()
                    #

                    try:
                        #wkname = GLOBALS[task.configid]['_name']
                        create_tasks = workflow_task_processor(task, task.target_wkname)
                    except TaskError, e:
                        log.error("Errors found in %s" % task)
                        pending_tasks.discard(task)
                        thread_errors[task.configid].append([task, e.value, e.msg])
                        continue
                    else:
                        logindent(-3)
                        to_add_tasks.update(create_tasks)
                        pending_tasks.discard(task)

                elif task.status == "E":
                    log.error("Task contains errors: %s" % task)
                    log.error("Errors found in %s" % task)
                    pending_tasks.discard(task)
                    thread_errors[task.configid].append(
                        [task, None, "Found (E) task status"])
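# A minimal sketch of the producer/consumer pattern the scheduler uses to hand
# jobs to the background launcher, assuming only the standard multiprocessing
# API; background_job_launcher itself is not shown and this simplified worker
# is hypothetical.
from multiprocessing import Process, Queue

def worker(job_queue):
    # Consume queued job descriptions until a None sentinel arrives.
    while True:
        item = job_queue.get()
        if item is None:
            break
        jobid, cores, cmd, status_file = item
        # a real launcher would run `cmd` here and track its status

if __name__ == "__main__":
    queue = Queue()
    launcher = Process(target=worker, args=(queue,))
    launcher.start()
    queue.put(["job-001", 1, "echo hello", "/tmp/job-001.status"])
    queue.put(None)        # tell the worker to stop
    launcher.join()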