def start(self):
    """Launch the subprocess for this job.

    Opens stdout/stderr targets — either caller-supplied paths from
    ``self._kwargs`` or default hidden capture files in the working
    directory — starts the process, and records its PID as the job id.
    ``launched_event`` is always set, even on failure, so threads
    waiting on the launch are never blocked forever.

    Raises:
        AssertionError: if the job was already started.
        Exception: whatever ``open``/``Popen`` raised; ``self.error`` is
            set to a descriptive RuntimeError before re-raising.
    """
    assert self.process is None, "Relaunching jobs is not allowed"
    try:
        _LOG.debug('launching %s.\n setting event' % " ".join(self._invocation))
        proc_cwd = self._kwargs.get('cwd', os.curdir)
        k = dict(self._kwargs)
        stdout_file_path = self._kwargs.get('stdout', None)
        stderr_file_path = self._kwargs.get('stderr', None)
        if stdout_file_path:
            self._stdout_fo = open_with_intermediates(stdout_file_path, 'w')
        else:
            # No explicit target: capture to a hidden file in the cwd.
            self._stdout_fo = open_with_intermediates(os.path.join(proc_cwd, '.Job.stdout.txt'), 'w')
        k['stdout'] = self._stdout_fo
        if stderr_file_path:
            self._stderr_fo = open_with_intermediates(stderr_file_path, 'w')
        else:
            self._stderr_fo = open_with_intermediates(os.path.join(proc_cwd, '.Job.stderr.txt'), 'w')
        k['stderr'] = self._stderr_fo
        self.process = Popen(self._invocation, stdin=PIPE, **k)
        # The job id is the child process PID.
        self.set_id(self.process.pid)
        _LOG.debug('setting launched_event')
    except BaseException:
        # Close any capture files we managed to open so the descriptors
        # are not leaked when Popen (or the second open) fails.
        for fo in (getattr(self, '_stdout_fo', None), getattr(self, '_stderr_fo', None)):
            if fo is not None:
                try:
                    fo.close()
                except Exception:
                    pass
        self.error = RuntimeError('The invocation:\n"%s"\nfailed' % '" "'.join(self._invocation))
        raise
    finally:
        # Always wake waiters, success or failure.
        self.launched_event.set()
def start(self):
    """Spawn this job's subprocess, wiring up stdout/stderr capture files.

    Caller-supplied 'stdout'/'stderr' paths in self._kwargs win;
    otherwise output is captured to hidden files in the working
    directory.  launched_event is set whether or not the launch
    succeeds, so waiters always proceed.
    """
    assert self.process is None, "Relaunching jobs is not allowed"
    try:
        _LOG.debug("launching %s.\n setting event" % " ".join(self._invocation))
        work_dir = self._kwargs.get("cwd", os.curdir)
        popen_kwargs = dict(self._kwargs)
        # Resolve each output stream to an open file object, falling
        # back to a hidden default file when no path was supplied.
        for stream_key, default_name, attr_name in (
                ("stdout", ".Job.stdout.txt", "_stdout_fo"),
                ("stderr", ".Job.stderr.txt", "_stderr_fo")):
            requested_path = self._kwargs.get(stream_key, None)
            if requested_path:
                stream_fo = open_with_intermediates(requested_path, "w")
            else:
                stream_fo = open_with_intermediates(os.path.join(work_dir, default_name), "w")
            setattr(self, attr_name, stream_fo)
            popen_kwargs[stream_key] = stream_fo
        self.process = Popen(self._invocation, stdin=PIPE, **popen_kwargs)
        # Child PID doubles as the job identifier.
        self.set_id(self.process.pid)
        _LOG.debug("setting launched_event")
    except:
        # Record a descriptive error for bookkeeping before propagating.
        self.error = RuntimeError('The invocation:\n"%s"\nfailed' % '" "'.join(self._invocation))
        raise
    finally:
        self.launched_event.set()
def create_job(self, alignment, starting_tree=None, name='default', **kwargs):
    """Create a DispatchableJob that generates a random starting tree.

    The tool is invoked via ``sys.executable`` on ``self.exe``; its
    likelihood score goes to a score file (also used as the job's
    stdout) and the tree to 'output.tre' in the scratch directory.
    The job's result is a ``(score, newick_tree_string)`` tuple.

    ``starting_tree`` and ``name`` are accepted for interface
    compatibility with the other create_job implementations but are
    not used here.
    """
    scratch_dir, seqfn, dt, score_fn = self._prepare_input(alignment, **kwargs)
    invoc = [sys.executable,
             self.exe,
             seqfn,
             dt,
             os.path.join(scratch_dir, 'output.tre'),
             ]
    score_fileobj = open_with_intermediates(score_fn, 'w')
    dirs_to_delete = []
    if kwargs.get('delete_temps', self.delete_temps):
        dirs_to_delete.append(scratch_dir)

    # Everything the processor needs is bound as a default argument so
    # it is self-contained once created.
    def randtree_result_processor(dir=scratch_dir,
                                  to_close=score_fileobj,
                                  score_fn=score_fn,
                                  fn=os.path.join(scratch_dir, 'output.tre'),
                                  dirs_to_delete=dirs_to_delete,
                                  temp_fs=self.temp_fs):
        to_close.close()
        # Close the readers explicitly so handles are not leaked
        # (the original relied on garbage collection to close them).
        score_f = open(score_fn, 'rU')
        try:
            score = float(score_f.read().strip())
        finally:
            score_f.close()
        tree_f = open(fn, 'rU')
        try:
            tree_str = tree_f.read().strip()
        finally:
            tree_f.close()
        for d in dirs_to_delete:
            temp_fs.remove_dir(d)
        return (score, tree_str)

    job_id = kwargs.get('context_str', '') + '_randtree'
    job = DispatchableJob(invoc,
                          result_processor=randtree_result_processor,
                          cwd=scratch_dir,
                          context_str=job_id,
                          stdout=score_fileobj)
    return job
def save_to_filepath(self, filepath):
    """Serialize the configuration to `filepath` via the config parser.

    When `filepath` is None, the default per-user location
    ~/.sate/sate.cfg is used.  Intermediate directories are created as
    needed by open_with_intermediates.
    """
    if filepath is None:
        filepath = os.path.expanduser(os.path.join('~', '.sate', 'sate.cfg'))
    f = open_with_intermediates(filepath, 'wb')
    try:
        # Push each category's current values into the shared parser
        # before writing it out.
        for g in self.get_categories():
            g.set_config_parser_fields(self._config_parser)
        self._config_parser.write(f)
    finally:
        # Close even when write() raises, so the handle is not leaked.
        f.close()
def create_input_files(self, job_subdir, input_subdir=None):
    """Create five empty placeholder FASTA files and return their paths.

    Files are named data1.fasta ... data5.fasta under
    top_dir/job_subdir (or top_dir/job_subdir/input_subdir when
    input_subdir is given); any missing intermediate directories are
    created by open_with_intermediates.
    """
    path_parts = [self.top_dir, job_subdir]
    if input_subdir is not None:
        path_parts.append(input_subdir)
    seq_dir = os.path.join(*path_parts)
    src_paths = []
    for index in range(1, 6):
        fasta_path = os.path.join(seq_dir, "data%d.fasta" % index)
        # "touch" the file, creating parent directories as needed.
        filemgr.open_with_intermediates(fasta_path, "w").close()
        src_paths.append(fasta_path)
    return src_paths
def start(self):
    """Launch the subprocess for this job.

    Unlike the path-based variants, this version only creates capture
    files when the caller did NOT already supply 'stdout'/'stderr'
    entries in self._kwargs (caller-supplied values are passed through
    to Popen untouched).  launched_event is always set, even on
    failure.
    """
    # Jobs are single-shot; a second start() is a programming error.
    assert self.process is None, "Relaunching jobs is not allowed"
    try:
        _LOG.debug('launching %s.\n setting event' % " ".join(self._invocation))
        proc_cwd = self._kwargs.get('cwd', os.curdir)
        k = dict(self._kwargs)
        if 'stdout' not in self._kwargs:
            # No explicit stdout: capture to a hidden file in the cwd.
            self._stdout_fo = open_with_intermediates(os.path.join(proc_cwd, '.Job.stdout.txt'), 'w')
            k['stdout'] = self._stdout_fo
        if 'stderr' not in self._kwargs:
            self._stderr_fo = open_with_intermediates(os.path.join(proc_cwd, '.Job.stderr.txt'), 'w')
            k['stderr'] = self._stderr_fo
        self.process = Popen(self._invocation, stdin = PIPE, **k)
        # The child PID doubles as the job identifier.
        self.set_id(self.process.pid)
        #f = open('.%s.pid' % self.get_id(), 'w')
        #f.close()
        _LOG.debug('setting launched_event')
    except:
        # Record a descriptive error for bookkeeping before propagating.
        self.error = RuntimeError('The invocation:\n"%s"\nfailed' % '" "'.join(self._invocation))
        raise
    finally:
        # Always wake threads blocked on the launch, success or failure.
        self.launched_event.set()
def create_job(self, alignment, starting_tree=None, **kwargs):
    """Build a DispatchableJob that runs FastTree on `alignment`.

    The invocation is assembled from the datatype flag, the configured
    model string, and any extra options from _prepare_input.  An
    optional starting tree (newick string or tree object) is written to
    'start.tre' and passed via -intree.  The job's result is produced
    by read_fasttree_results.

    kwargs of interest:
        num_cpus      -- when > 1, switch to the 'MP' binary and set
                         OMP_NUM_THREADS (defaults to 1 when absent).
        delete_temps  -- overrides self.delete_temps for cleanup.
        context_str   -- prefix for the job id.
    """
    scratch_dir, seqfn, datatype, options = self._prepare_input(alignment, **kwargs)
    # Default to a single CPU when the caller does not say otherwise;
    # the original compared None > 1, a TypeError on Python 3.
    num_cpus = kwargs.get('num_cpus') or 1
    log_file = os.path.join(scratch_dir, 'log')
    invoc = [self.exe, '-quiet']
    if datatype != '':
        invoc.extend([datatype])
    model = self.model if self.model is not None else ''
    if model != '':
        # The model string may carry several whitespace-separated flags.
        invoc.extend(model.split(' '))
    if options is not None and len(options) >= 1:
        invoc.extend(options)
    fasttree_result = os.path.join(scratch_dir, 'results')
    results_fileobj = open_with_intermediates(fasttree_result, 'w')
    if starting_tree is not None:
        # Accept either a ready newick string or a tree object.
        if isinstance(starting_tree, str):
            tree_str = starting_tree
        else:
            tree_str = starting_tree.compose_newick()
        tree_fn = os.path.join(os.path.abspath(scratch_dir), "start.tre")
        tree_file_obj = open(tree_fn, "w")
        try:
            tree_file_obj.write("%s;\n" % tree_str)
        finally:
            tree_file_obj.close()
        invoc.extend(['-intree', tree_fn])
    invoc.extend(['-log', log_file, seqfn])
    if num_cpus > 1:
        # os.putenv() does not update os.environ; assigning the mapping
        # both exports to children and keeps this process consistent.
        os.environ["OMP_NUM_THREADS"] = str(num_cpus)
        invoc[0] += 'MP'
    rpc = lambda: read_fasttree_results(results_fileobj, scratch_dir,
                                        fasttree_result, log_file,
                                        delete_dir=kwargs.get('delete_temps', self.delete_temps))
    job_id = kwargs.get('context_str', '') + '_fasttree'
    job = DispatchableJob(invoc,
                          result_processor=rpc,
                          cwd=scratch_dir,
                          stdout=results_fileobj,
                          context_str=job_id)
    return job
def _write_partition_filepath(self, parfn, partitions, model):
    """Write a RAxML-style partition file to `parfn`.

    partitions -- list of (datatype, start, end) tuples, e.g.
        [("DNA", 1, 30), ("DNA", 31, 60), ("PROTEIN", 61, 100)]
    model -- RAxML model string; for protein partitions the matrix
        name is recovered by stripping the PROTGAMMA[I]/PROTCAT[I]
        rate-heterogeneity prefix (e.g. PROTGAMMAWAG -> WAG).

    Each partition is written as "<model>, p<n>=<start>-<end>".
    """
    file_obj = open_with_intermediates(parfn, 'w')
    try:
        for count, item in enumerate(partitions, 1):
            key = ""
            if item[0] == "DNA":
                key = "DNA"
            elif item[0] == "PROTEIN":
                # Strip the prefix (with or without the 'I' invariant-
                # sites suffix) to get the substitution matrix name.
                if model.startswith("PROTGAMMA"):
                    key = model[len("PROTGAMMAI"):] if model.startswith("PROTGAMMAI") else model[len("PROTGAMMA"):]
                if model.startswith("PROTCAT"):
                    key = model[len("PROTCATI"):] if model.startswith("PROTCATI") else model[len("PROTCAT"):]
            file_obj.write("%s, p%s=%s-%s\n" % (key, count, item[1], item[2]))
    finally:
        # Close even on a write error so the handle is not leaked.
        file_obj.close()
def _write_partition_filepath(self, parfn, partitions, model):
    """Write a RAxML-style partition file describing `partitions` to `parfn`."""
    # partition --- list of tuples, [("DNA", 1, 30), ("DNA", 31, 60), ("PROTEIN", 61, 100)]
    file_obj = open_with_intermediates(parfn, 'w')
    count = 0
    for item in partitions:
        key = ""
        count += 1
        if item[0] == "DNA":
            key = "DNA"
        elif item[0] == "PROTEIN":
            # Recover the substitution-matrix name by stripping the
            # PROTGAMMA[I] / PROTCAT[I] prefix from the model string
            # (e.g. PROTGAMMAWAG -> WAG).
            if model.startswith("PROTGAMMA"):
                key = model[len("PROTGAMMAI"):] if model.startswith(
                    "PROTGAMMAI") else model[len("PROTGAMMA"):]
            if model.startswith("PROTCAT"):
                key = model[len("PROTCATI"):] if model.startswith(
                    "PROTCATI") else model[len("PROTCAT"):]
        # One line per partition: "<model>, p<n>=<start>-<end>"
        file_obj.write("%s, p%s=%s-%s\n" % (key, count, item[1], item[2]))
    file_obj.close()
def create_job(self, alignment, guide_tree=None, **kwargs):
    """Create a DispatchableJob that aligns `alignment` with MAFFT.

    Single-sequence inputs need no alignment and are returned wrapped
    in a FakeJob.  Small data sets (<= 200 sequences, longest sequence
    < 10000) get the slower, more accurate L-INS-i settings.  MAFFT
    writes to standard output, so the job's stdout is redirected to the
    aligned-output file and read back by the result processor.

    guide_tree is accepted for interface compatibility but unused here.
    """
    job_id = kwargs.get('context_str', '') + '_mafft'
    if alignment.get_num_taxa() == 1:
        return FakeJob(alignment, context_str=job_id)
    scratch_dir, seqfn, alignedfn = self._prepare_input(alignment, **kwargs)
    aligned_fileobj = open_with_intermediates(alignedfn, 'w')
    invoc = []
    if platform.system() == "Windows":
        invoc.append(self.exe)
    else:
        # On non-Windows platforms the tool is a script run through the
        # current interpreter.
        invoc.extend([sys.executable, self.exe])
    if len(alignment) <= 200 and alignment.max_sequence_length() < 10000:
        invoc.extend(['--localpair', '--maxiterate', '1000'])
    if '--ep' not in self.user_opts:
        invoc.extend(['--ep', '0.123'])
    invoc.extend(['--quiet', seqfn])
    invoc.extend(self.user_opts)
    # The MAFFT job creation is slightly different from the other
    # aligners because we redirect and read standard output.
    dirs_to_delete = []
    if kwargs.get('delete_temps', self.delete_temps):
        dirs_to_delete.append(scratch_dir)

    def mafft_result_processor(to_close=aligned_fileobj,
                               fn=alignedfn,
                               datatype=alignment.datatype,
                               dirs_to_delete=dirs_to_delete,
                               temp_fs=self.temp_fs):
        to_close.close()
        # Use the default-bound parameter `fn` (the original referenced
        # the closure variable `alignedfn`, defeating the early-binding
        # pattern the default arguments establish).
        return read_internal_alignment(fn=fn,
                                       datatype=datatype,
                                       dirs_to_delete=dirs_to_delete,
                                       temp_fs=temp_fs)

    job = DispatchableJob(invoc,
                          result_processor=mafft_result_processor,
                          cwd=scratch_dir,
                          stdout=aligned_fileobj,
                          context_str=job_id)
    return job
def finish_sate_execution(sate_team, user_config, temporaries_dir, multilocus_dataset, sate_products):
    """Run the main SATe pipeline to completion.

    Reads an optional starting tree, relabels taxa, optionally performs
    an initial whole-matrix alignment and a starting-tree search, runs
    the iterative SATe algorithm, and writes the final alignment(s),
    tree, and likelihood score to the sate_products streams.  Signal
    handlers are temporarily installed so long-running external jobs
    can be killed; they are restored in the finally block.
    """
    global _RunningJobs
    # get the RAxML model
    #TODO: this should check for the tree_estimator. Currently we only support raxml, so this works...
    model = user_config.raxml.model
    options = user_config.commandline
    user_config.save_to_filepath(os.path.join(temporaries_dir, 'last_used.cfg'))
    if options.timesfile:
        # Touch the timing log so later appends cannot fail on a missing path.
        f = open_with_intermediates(options.timesfile, 'a')
        f.close()
        set_timing_log_filepath(options.timesfile)
    ############################################################################
    # We must read the incoming tree in before we call the get_sequences_for_sate
    # function that relabels that taxa in the dataset
    ######
    tree_file = options.treefile
    if tree_file:
        if not os.path.exists(tree_file):
            raise Exception('The tree file "%s" does not exist' % tree_file)
        tree_f = open(tree_file, 'rU')
        MESSENGER.send_info('Reading starting trees from "%s"...' % tree_file)
        tree_list = read_and_encode_splits(multilocus_dataset.dataset, tree_f)
        tree_f.close()
        if len(tree_list) > 1:
            MESSENGER.send_warning('%d starting trees found in "%s". The first tree will be used.'
                                   % (len(tree_list), tree_file))
        starting_tree = tree_list[0]
    # score stays None when a user tree is supplied (no search was run).
    score = None
    ############################################################################
    # This will relabel the taxa if they have problematic names
    #####
    multilocus_dataset.relabel_for_sate()
    options.aligned = all( [i.is_aligned() for i in multilocus_dataset] )
    ############################################################################
    # Launch threads to do work
    #####
    sate_config = user_config.get("sate")
    start_worker(sate_config.num_cpus)
    ############################################################################
    # Be prepared to kill any long running jobs
    #####
    prev_signals = []
    for sig in [signal.SIGTERM, signal.SIGABRT, signal.SIGINT]: # signal.SIGABRT, signal.SIGBUS, signal.SIGINT, signal.SIGKILL, signal.SIGSTOP]:
        prev_handler = signal.signal(sig, killed_handler)
        prev_signals.append((sig, prev_handler))
    try:
        if tree_file:
            # getting the newick string here will allow us to get a string that is in terms of the correct taxon labels
            starting_tree_str = starting_tree.compose_newick()
        else:
            MESSENGER.send_info("Performing initial tree search to get starting tree...")
            if not options.aligned:
                MESSENGER.send_info("Performing initial alignment of the entire data matrix...")
                init_aln_dir = os.path.join(temporaries_dir, 'init_aln')
                init_aln_dir = sate_team.temp_fs.create_subdir(init_aln_dir)
                delete_aln_temps = not (options.keeptemp and options.keepalignmenttemps)
                new_alignment_list = []
                # Align each locus serially, tracking the running job so
                # the signal handler can kill it.
                for unaligned_seqs in multilocus_dataset:
                    job = sate_team.aligner.create_job(unaligned_seqs,
                                                       tmp_dir_par=init_aln_dir,
                                                       context_str="initalign",
                                                       delete_temps=delete_aln_temps)
                    _RunningJobs = job
                    jobq.put(job)
                    new_alignment = job.get_results()
                    _RunningJobs = None
                    new_alignment_list.append(new_alignment)
                for locus_index, new_alignment in enumerate(new_alignment_list):
                    multilocus_dataset[locus_index] = new_alignment
                if delete_aln_temps:
                    sate_team.temp_fs.remove_dir(init_aln_dir)
            else:
                MESSENGER.send_info("Input sequences assumed to be aligned (based on sequence lengths).")
            MESSENGER.send_info("Performing initial tree search to get starting tree...")
            init_tree_dir = os.path.join(temporaries_dir, 'init_tree')
            init_tree_dir = sate_team.temp_fs.create_subdir(init_tree_dir)
            delete_tree_temps = not options.keeptemp
            job = sate_team.tree_estimator.create_job(multilocus_dataset,
                                                      tmp_dir_par=init_tree_dir,
                                                      num_cpus=sate_config.num_cpus,
                                                      context_str="inittree",
                                                      delete_temps=delete_tree_temps)
            _RunningJobs = job
            jobq.put(job)
            score, starting_tree_str = job.get_results()
            _RunningJobs = None
            if delete_tree_temps:
                sate_team.temp_fs.remove_dir(init_tree_dir)
        _LOG.debug('We have the tree and whole_alignment, partitions...')
        sate_config_dict = sate_config.dict()
        if options.keeptemp:
            sate_config_dict['keep_iteration_temporaries'] = True
            if options.keepalignmenttemps:
                sate_config_dict['keep_realignment_temporaries'] = True
        job = SateJob(multilocus_dataset=multilocus_dataset,
                      sate_team=sate_team,
                      name=options.job,
                      status_messages=MESSENGER.send_info,
                      **sate_config_dict)
        job.tree_str = starting_tree_str
        if score is not None:
            # Seed the optimum with the initial search result so the
            # first SATe iteration has a baseline to beat.
            job.store_optimum_results(new_multilocus_dataset=multilocus_dataset,
                                      new_tree_str=starting_tree_str,
                                      new_score=score,
                                      curr_timestamp=time.time())
        _RunningJobs = job
        MESSENGER.send_info("Starting SATe algorithm on initial tree...")
        job.run(tmp_dir_par=temporaries_dir)
        _RunningJobs = None
        # Undo the "safe name" relabeling before writing user-facing output.
        job.multilocus_dataset.restore_taxon_names()
        assert len(sate_products.alignment_streams) == len(job.multilocus_dataset)
        for i, alignment in enumerate(job.multilocus_dataset):
            alignment_stream = sate_products.alignment_streams[i]
            MESSENGER.send_info("Writing final alignment to %s" % alignment_stream.name)
            alignment.write(alignment_stream, file_format="FASTA")
            alignment_stream.close()
        MESSENGER.send_info("Writing final tree to %s" % sate_products.tree_stream.name)
        tree_str = job.tree.compose_newick()
        sate_products.tree_stream.write("%s;\n" % tree_str)
        #outtree_fn = options.result
        #if outtree_fn is None:
        #    if options.multilocus:
        #        outtree_fn = os.path.join(seqdir, "combined_%s.tre" % options.job)
        #    else:
        #        outtree_fn = aln_filename + ".tre"
        #MESSENGER.send_info("Writing final tree to %s" % outtree_fn)
        #tree_str = job.tree.compose_newick()
        #sate_products.tree_stream.write("%s;\n" % tree_str)
        MESSENGER.send_info("Writing final likelihood score to %s" % sate_products.score_stream.name)
        sate_products.score_stream.write("%s\n" % job.score)
    finally:
        # Restore the signal handlers we displaced above.
        for el in prev_signals:
            sig, prev_handler = el
            if prev_handler is None:
                signal.signal(sig, signal.SIG_DFL)
            else:
                signal.signal(sig, prev_handler)
def finish_sate_execution(sate_team, user_config, temporaries_dir, multilocus_dataset, sate_products):
    """Run the SATe pipeline, with two-phase mode and RAxML post-search.

    Extends the basic pipeline with: reporting of the temporary files
    that first held the result alignment/tree, RNA->DNA normalization
    (restored on output), export of the safe-name/original-name
    translation table, a --two-phase mode that stops after the initial
    alignment+tree, and an optional post-processing RAxML tree search.
    Signal handlers installed for job killing are restored in the
    finally block.
    """
    global _RunningJobs
    # get the RAxML model
    #TODO: this should check for the tree_estimator. Currently we only support raxml, so this works...
    model = user_config.raxml.model
    options = user_config.commandline
    user_config.save_to_filepath(os.path.join(temporaries_dir, 'last_used.cfg'))
    if options.timesfile:
        # Touch the timing log so later appends cannot fail on a missing path.
        f = open_with_intermediates(options.timesfile, 'a')
        f.close()
        set_timing_log_filepath(options.timesfile)
    ############################################################################
    # We must read the incoming tree in before we call the get_sequences_for_sate
    # function that relabels that taxa in the dataset
    ######
    alignment_as_tmp_filename_to_report = None
    tree_as_tmp_filename_to_report = None
    tree_file = options.treefile
    if tree_file:
        if not os.path.exists(tree_file):
            raise Exception('The tree file "%s" does not exist' % tree_file)
        tree_f = open(tree_file, 'rU')
        MESSENGER.send_info('Reading starting trees from "%s"...' % tree_file)
        try:
            tree_list = read_and_encode_splits(multilocus_dataset.dataset, tree_f, starting_tree=True)
        except KeyError:
            MESSENGER.send_error("Error in reading the treefile, probably due to a name in the tree that does not match the names in the input sequence files.\n")
            raise
        except:
            MESSENGER.send_error("Error in reading the treefile.\n")
            raise
        tree_f.close()
        if len(tree_list) > 1:
            MESSENGER.send_warning('%d starting trees found in "%s". The first tree will be used.'
                                   % (len(tree_list), tree_file))
        starting_tree = tree_list[0]
        # No search was run for a user-supplied tree, so there is no score.
        score = None
        tree_as_tmp_filename_to_report = tree_file
    ############################################################################
    # This will relabel the taxa if they have problematic names
    #####
    multilocus_dataset.relabel_for_sate()
    ############################################################################
    # This ensures all nucleotide data is DNA internally
    #####
    restore_to_rna = False
    if user_config.commandline.datatype.upper() == 'RNA':
        multilocus_dataset.convert_rna_to_dna()
        user_config.commandline.datatype = 'DNA'
        restore_to_rna = True
    export_names = True
    if export_names:
        try:
            # Save the safe-name -> original-name mapping so users can
            # decode intermediate files; best-effort only.
            name_filename = sate_products.get_abs_path_for_tag('name_translation.txt')
            name_output = open(name_filename, 'w')
            safe2real = multilocus_dataset.safe_to_real_names
            safe_list = safe2real.keys()
            safe_list.sort()
            for safe in safe_list:
                orig = safe2real[safe][0]
                name_output.write("%s\n%s\n\n" % (safe, orig))
            name_output.close()
            MESSENGER.send_info("Name translation information saved to %s as safe name, original name, blank line format."
                                % name_filename)
        except:
            MESSENGER.send_info("Error exporting saving name translation to %s" % name_filename)
    if options.aligned:
        # Trust the --aligned flag only if the sequences actually look aligned.
        options.aligned = all( [i.is_aligned() for i in multilocus_dataset] )
    ############################################################################
    # Launch threads to do work
    #####
    sate_config = user_config.get("sate")
    start_worker(sate_config.num_cpus)
    ############################################################################
    # Be prepared to kill any long running jobs
    #####
    prev_signals = []
    for sig in [signal.SIGTERM, signal.SIGABRT, signal.SIGINT]: # signal.SIGABRT, signal.SIGBUS, signal.SIGINT, signal.SIGKILL, signal.SIGSTOP]:
        prev_handler = signal.signal(sig, killed_handler)
        prev_signals.append((sig, prev_handler))
    try:
        if (not options.two_phase) and tree_file:
            # getting the newick string here will allow us to get a string that is in terms of the correct taxon labels
            starting_tree_str = starting_tree.compose_newick()
        else:
            if not options.two_phase:
                MESSENGER.send_info("Creating a starting tree for the SATe algorithm...")
            if (options.two_phase) or (not options.aligned):
                MESSENGER.send_info("Performing initial alignment of the entire data matrix...")
                init_aln_dir = os.path.join(temporaries_dir, 'init_aln')
                init_aln_dir = sate_team.temp_fs.create_subdir(init_aln_dir)
                delete_aln_temps = not (options.keeptemp and options.keepalignmenttemps)
                new_alignment_list = []
                aln_job_list = []
                # Create all per-locus alignment jobs first, then queue
                # them so they can run concurrently on the workers.
                for unaligned_seqs in multilocus_dataset:
                    job = sate_team.aligner.create_job(unaligned_seqs,
                                                       tmp_dir_par=init_aln_dir,
                                                       context_str="initalign",
                                                       delete_temps=delete_aln_temps)
                    aln_job_list.append(job)
                _RunningJobs = aln_job_list
                for job in aln_job_list:
                    jobq.put(job)
                for job in aln_job_list:
                    new_alignment = job.get_results()
                    new_alignment_list.append(new_alignment)
                _RunningJobs = None
                for locus_index, new_alignment in enumerate(new_alignment_list):
                    multilocus_dataset[locus_index] = new_alignment
                if delete_aln_temps:
                    sate_team.temp_fs.remove_dir(init_aln_dir)
            else:
                MESSENGER.send_info("Input sequences assumed to be aligned (based on sequence lengths).")
            MESSENGER.send_info("Performing initial tree search to get starting tree...")
            init_tree_dir = os.path.join(temporaries_dir, 'init_tree')
            init_tree_dir = sate_team.temp_fs.create_subdir(init_tree_dir)
            delete_tree_temps = not options.keeptemp
            job = sate_team.tree_estimator.create_job(multilocus_dataset,
                                                      tmp_dir_par=init_tree_dir,
                                                      num_cpus=sate_config.num_cpus,
                                                      context_str="inittree",
                                                      delete_temps=delete_tree_temps,
                                                      sate_products=sate_products,
                                                      step_num='initialsearch')
            _RunningJobs = job
            jobq.put(job)
            score, starting_tree_str = job.get_results()
            _RunningJobs = None
            alignment_as_tmp_filename_to_report = sate_products.get_abs_path_for_iter_output("initialsearch", TEMP_SEQ_ALIGNMENT_TAG, allow_existing=True)
            tree_as_tmp_filename_to_report = sate_products.get_abs_path_for_iter_output("initialsearch", TEMP_TREE_TAG, allow_existing=True)
            if delete_tree_temps:
                sate_team.temp_fs.remove_dir(init_tree_dir)
        _LOG.debug('We have the tree and whole_alignment, partitions...')
        sate_config_dict = sate_config.dict()
        if options.keeptemp:
            sate_config_dict['keep_iteration_temporaries'] = True
            if options.keepalignmenttemps:
                sate_config_dict['keep_realignment_temporaries'] = True
        job = SateJob(multilocus_dataset=multilocus_dataset,
                      sate_team=sate_team,
                      name=options.job,
                      status_messages=MESSENGER.send_info,
                      score=score,
                      **sate_config_dict)
        job.tree_str = starting_tree_str
        job.curr_iter_align_tmp_filename = alignment_as_tmp_filename_to_report
        job.curr_iter_tree_tmp_filename = tree_as_tmp_filename_to_report
        if score is not None:
            # Seed the optimum with the initial search result so the
            # first SATe iteration has a baseline to beat.
            job.store_optimum_results(new_multilocus_dataset=multilocus_dataset,
                                      new_tree_str=starting_tree_str,
                                      new_score=score,
                                      curr_timestamp=time.time())
        if options.two_phase:
            MESSENGER.send_info("Exiting with the initial tree because the SATe algorithm is avoided when the --two-phase option is used.")
        else:
            _RunningJobs = job
            MESSENGER.send_info("Starting SATe algorithm on initial tree...")
            job.run(tmp_dir_par=temporaries_dir, sate_products=sate_products)
            _RunningJobs = None
            if job.return_final_tree_and_alignment:
                alignment_as_tmp_filename_to_report = job.curr_iter_align_tmp_filename
            else:
                alignment_as_tmp_filename_to_report = job.best_alignment_tmp_filename
            if user_config.commandline.raxml_search_after:
                raxml_model = user_config.raxml.model.strip()
                if not raxml_model:
                    # Derive a RAxML model string from the FastTree model
                    # when the user did not specify one explicitly.
                    dt = user_config.commandline.datatype
                    mf = sate_team.tree_estimator.model
                    ms = fasttree_to_raxml_model_str(dt, mf)
                    sate_team.raxml_tree_estimator.model = ms
                rte = sate_team.raxml_tree_estimator
                MESSENGER.send_info("Performing post-processing tree search in RAxML...")
                post_tree_dir = os.path.join(temporaries_dir, 'post_tree')
                post_tree_dir = sate_team.temp_fs.create_subdir(post_tree_dir)
                delete_tree_temps = not options.keeptemp
                starting_tree = None
                if user_config.sate.start_tree_search_from_current:
                    starting_tree = job.tree
                post_job = rte.create_job(job.multilocus_dataset,
                                          starting_tree=starting_tree,
                                          num_cpus=sate_config.num_cpus,
                                          context_str="postraxtree",
                                          tmp_dir_par=post_tree_dir,
                                          delete_temps=delete_tree_temps,
                                          sate_products=sate_products,
                                          step_num="postraxtree")
                _RunningJobs = post_job
                jobq.put(post_job)
                post_score, post_tree = post_job.get_results()
                _RunningJobs = None
                tree_as_tmp_filename_to_report = sate_products.get_abs_path_for_iter_output("postraxtree", TEMP_TREE_TAG, allow_existing=True)
                if delete_tree_temps:
                    sate_team.temp_fs.remove_dir(post_tree_dir)
                job.tree_str = post_tree
                job.score = post_score
                if post_score > job.best_score:
                    job.best_tree_str = post_tree
                    job.best_score = post_score
            else:
                if job.return_final_tree_and_alignment:
                    tree_as_tmp_filename_to_report = job.curr_iter_tree_tmp_filename
                else:
                    tree_as_tmp_filename_to_report = job.best_tree_tmp_filename
        #######################################################################
        # Restore original taxon names and RNA characters
        #####
        job.multilocus_dataset.restore_taxon_names()
        if restore_to_rna:
            job.multilocus_dataset.convert_dna_to_rna()
            user_config.commandline.datatype = 'RNA'
        assert len(sate_products.alignment_streams) == len(job.multilocus_dataset)
        for i, alignment in enumerate(job.multilocus_dataset):
            alignment_stream = sate_products.alignment_streams[i]
            MESSENGER.send_info("Writing resulting alignment to %s" % alignment_stream.name)
            alignment.write(alignment_stream, file_format="FASTA")
            alignment_stream.close()
        MESSENGER.send_info("Writing resulting tree to %s" % sate_products.tree_stream.name)
        tree_str = job.tree.compose_newick()
        sate_products.tree_stream.write("%s;\n" % tree_str)
        #outtree_fn = options.result
        #if outtree_fn is None:
        #    if options.multilocus:
        #        outtree_fn = os.path.join(seqdir, "combined_%s.tre" % options.job)
        #    else:
        #        outtree_fn = aln_filename + ".tre"
        #MESSENGER.send_info("Writing resulting tree to %s" % outtree_fn)
        #tree_str = job.tree.compose_newick()
        #sate_products.tree_stream.write("%s;\n" % tree_str)
        MESSENGER.send_info("Writing resulting likelihood score to %s" % sate_products.score_stream.name)
        sate_products.score_stream.write("%s\n" % job.score)
        if alignment_as_tmp_filename_to_report is not None:
            MESSENGER.send_info('The resulting alignment (with the names in a "safe" form) was first written as the file "%s"' % alignment_as_tmp_filename_to_report)
        if tree_as_tmp_filename_to_report is not None:
            MESSENGER.send_info('The resulting tree (with the names in a "safe" form) was first written as the file "%s"' % tree_as_tmp_filename_to_report)
    finally:
        # Restore the signal handlers we displaced above.
        for el in prev_signals:
            sig, prev_handler = el
            if prev_handler is None:
                signal.signal(sig, signal.SIG_DFL)
            else:
                signal.signal(sig, prev_handler)
def finish_sate_execution(sate_team, user_config, temporaries_dir, multilocus_dataset, sate_products):
    """Drive a full SATe run and write its products.

    Steps: save the effective config; read an optional user-supplied
    starting tree (before taxa are relabeled); normalize RNA to DNA;
    export the safe/original name translation; optionally align the
    matrix and search for a starting tree; run the iterative SATe
    algorithm (skipped under --two-phase); optionally post-process with
    a RAxML search; finally restore names/RNA and write the alignment,
    tree, and score streams.  Temporary kill-signal handlers are
    restored in the finally block.
    """
    global _RunningJobs
    # get the RAxML model
    #TODO: this should check for the tree_estimator. Currently we only support raxml, so this works...
    model = user_config.raxml.model
    options = user_config.commandline
    user_config.save_to_filepath(os.path.join(temporaries_dir, 'last_used.cfg'))
    if options.timesfile:
        # Touch the timing log so later appends cannot fail on a missing path.
        f = open_with_intermediates(options.timesfile, 'a')
        f.close()
        set_timing_log_filepath(options.timesfile)
    ############################################################################
    # We must read the incoming tree in before we call the get_sequences_for_sate
    # function that relabels that taxa in the dataset
    ######
    alignment_as_tmp_filename_to_report = None
    tree_as_tmp_filename_to_report = None
    tree_file = options.treefile
    if tree_file:
        if not os.path.exists(tree_file):
            raise Exception('The tree file "%s" does not exist' % tree_file)
        tree_f = open(tree_file, 'rU')
        MESSENGER.send_info('Reading starting trees from "%s"...' % tree_file)
        try:
            tree_list = read_and_encode_splits(multilocus_dataset.dataset, tree_f, starting_tree=True)
        except KeyError:
            MESSENGER.send_error(
                "Error in reading the treefile, probably due to a name in the tree that does not match the names in the input sequence files.\n"
            )
            raise
        except:
            MESSENGER.send_error("Error in reading the treefile.\n")
            raise
        tree_f.close()
        if len(tree_list) > 1:
            MESSENGER.send_warning(
                '%d starting trees found in "%s". The first tree will be used.'
                % (len(tree_list), tree_file))
        starting_tree = tree_list[0]
        # No search was run for a user-supplied tree, so there is no score.
        score = None
        tree_as_tmp_filename_to_report = tree_file
    ############################################################################
    # This will relabel the taxa if they have problematic names
    #####
    multilocus_dataset.relabel_for_sate()
    ############################################################################
    # This ensures all nucleotide data is DNA internally
    #####
    restore_to_rna = False
    if user_config.commandline.datatype.upper() == 'RNA':
        multilocus_dataset.convert_rna_to_dna()
        user_config.commandline.datatype = 'DNA'
        restore_to_rna = True
    export_names = True
    if export_names:
        try:
            # Best-effort export of the safe-name -> original-name map.
            name_filename = sate_products.get_abs_path_for_tag('name_translation.txt')
            name_output = open(name_filename, 'w')
            safe2real = multilocus_dataset.safe_to_real_names
            safe_list = safe2real.keys()
            safe_list.sort()
            for safe in safe_list:
                orig = safe2real[safe][0]
                name_output.write("%s\n%s\n\n" % (safe, orig))
            name_output.close()
            MESSENGER.send_info(
                "Name translation information saved to %s as safe name, original name, blank line format."
                % name_filename)
        except:
            MESSENGER.send_info(
                "Error exporting saving name translation to %s" % name_filename)
    if options.aligned:
        # Trust the --aligned flag only if the sequences actually look aligned.
        options.aligned = all([i.is_aligned() for i in multilocus_dataset])
    ############################################################################
    # Launch threads to do work
    #####
    sate_config = user_config.get("sate")
    start_worker(sate_config.num_cpus)
    ############################################################################
    # Be prepared to kill any long running jobs
    #####
    prev_signals = []
    for sig in [
            signal.SIGTERM, signal.SIGABRT, signal.SIGINT
    ]: # signal.SIGABRT, signal.SIGBUS, signal.SIGINT, signal.SIGKILL, signal.SIGSTOP]:
        prev_handler = signal.signal(sig, killed_handler)
        prev_signals.append((sig, prev_handler))
    try:
        if (not options.two_phase) and tree_file:
            # getting the newick string here will allow us to get a string that is in terms of the correct taxon labels
            starting_tree_str = starting_tree.compose_newick()
        else:
            if not options.two_phase:
                MESSENGER.send_info(
                    "Creating a starting tree for the SATe algorithm...")
            if (options.two_phase) or (not options.aligned):
                MESSENGER.send_info(
                    "Performing initial alignment of the entire data matrix...")
                init_aln_dir = os.path.join(temporaries_dir, 'init_aln')
                init_aln_dir = sate_team.temp_fs.create_subdir(init_aln_dir)
                delete_aln_temps = not (options.keeptemp and options.keepalignmenttemps)
                new_alignment_list = []
                aln_job_list = []
                # Build all per-locus alignment jobs, then queue them so
                # they can run concurrently on the worker threads.
                for unaligned_seqs in multilocus_dataset:
                    job = sate_team.aligner.create_job(
                        unaligned_seqs,
                        tmp_dir_par=init_aln_dir,
                        context_str="initalign",
                        delete_temps=delete_aln_temps)
                    aln_job_list.append(job)
                _RunningJobs = aln_job_list
                for job in aln_job_list:
                    jobq.put(job)
                for job in aln_job_list:
                    new_alignment = job.get_results()
                    new_alignment_list.append(new_alignment)
                _RunningJobs = None
                for locus_index, new_alignment in enumerate(new_alignment_list):
                    multilocus_dataset[locus_index] = new_alignment
                if delete_aln_temps:
                    sate_team.temp_fs.remove_dir(init_aln_dir)
            else:
                MESSENGER.send_info(
                    "Input sequences assumed to be aligned (based on sequence lengths)."
                )
            MESSENGER.send_info(
                "Performing initial tree search to get starting tree...")
            init_tree_dir = os.path.join(temporaries_dir, 'init_tree')
            init_tree_dir = sate_team.temp_fs.create_subdir(init_tree_dir)
            delete_tree_temps = not options.keeptemp
            job = sate_team.tree_estimator.create_job(
                multilocus_dataset,
                tmp_dir_par=init_tree_dir,
                num_cpus=sate_config.num_cpus,
                context_str="inittree",
                delete_temps=delete_tree_temps,
                sate_products=sate_products,
                step_num='initialsearch')
            _RunningJobs = job
            jobq.put(job)
            score, starting_tree_str = job.get_results()
            _RunningJobs = None
            alignment_as_tmp_filename_to_report = sate_products.get_abs_path_for_iter_output(
                "initialsearch", TEMP_SEQ_ALIGNMENT_TAG, allow_existing=True)
            tree_as_tmp_filename_to_report = sate_products.get_abs_path_for_iter_output(
                "initialsearch", TEMP_TREE_TAG, allow_existing=True)
            if delete_tree_temps:
                sate_team.temp_fs.remove_dir(init_tree_dir)
        _LOG.debug('We have the tree and whole_alignment, partitions...')
        sate_config_dict = sate_config.dict()
        if options.keeptemp:
            sate_config_dict['keep_iteration_temporaries'] = True
            if options.keepalignmenttemps:
                sate_config_dict['keep_realignment_temporaries'] = True
        job = SateJob(multilocus_dataset=multilocus_dataset,
                      sate_team=sate_team,
                      name=options.job,
                      status_messages=MESSENGER.send_info,
                      score=score,
                      **sate_config_dict)
        job.tree_str = starting_tree_str
        job.curr_iter_align_tmp_filename = alignment_as_tmp_filename_to_report
        job.curr_iter_tree_tmp_filename = tree_as_tmp_filename_to_report
        if score is not None:
            # Seed the optimum so the first SATe iteration has a
            # baseline score to beat.
            job.store_optimum_results(
                new_multilocus_dataset=multilocus_dataset,
                new_tree_str=starting_tree_str,
                new_score=score,
                curr_timestamp=time.time())
        if options.two_phase:
            MESSENGER.send_info(
                "Exiting with the initial tree because the SATe algorithm is avoided when the --two-phase option is used."
            )
        else:
            _RunningJobs = job
            MESSENGER.send_info("Starting SATe algorithm on initial tree...")
            job.run(tmp_dir_par=temporaries_dir, sate_products=sate_products)
            _RunningJobs = None
            if job.return_final_tree_and_alignment:
                alignment_as_tmp_filename_to_report = job.curr_iter_align_tmp_filename
            else:
                alignment_as_tmp_filename_to_report = job.best_alignment_tmp_filename
            if user_config.commandline.raxml_search_after:
                raxml_model = user_config.raxml.model.strip()
                if not raxml_model:
                    # Derive a RAxML model string from the FastTree model
                    # when the user did not specify one explicitly.
                    dt = user_config.commandline.datatype
                    mf = sate_team.tree_estimator.model
                    ms = fasttree_to_raxml_model_str(dt, mf)
                    sate_team.raxml_tree_estimator.model = ms
                rte = sate_team.raxml_tree_estimator
                MESSENGER.send_info(
                    "Performing post-processing tree search in RAxML...")
                post_tree_dir = os.path.join(temporaries_dir, 'post_tree')
                post_tree_dir = sate_team.temp_fs.create_subdir(post_tree_dir)
                delete_tree_temps = not options.keeptemp
                starting_tree = None
                if user_config.sate.start_tree_search_from_current:
                    starting_tree = job.tree
                post_job = rte.create_job(job.multilocus_dataset,
                                          starting_tree=starting_tree,
                                          num_cpus=sate_config.num_cpus,
                                          context_str="postraxtree",
                                          tmp_dir_par=post_tree_dir,
                                          delete_temps=delete_tree_temps,
                                          sate_products=sate_products,
                                          step_num="postraxtree")
                _RunningJobs = post_job
                jobq.put(post_job)
                post_score, post_tree = post_job.get_results()
                _RunningJobs = None
                tree_as_tmp_filename_to_report = sate_products.get_abs_path_for_iter_output(
                    "postraxtree", TEMP_TREE_TAG, allow_existing=True)
                if delete_tree_temps:
                    sate_team.temp_fs.remove_dir(post_tree_dir)
                job.tree_str = post_tree
                job.score = post_score
                if post_score > job.best_score:
                    job.best_tree_str = post_tree
                    job.best_score = post_score
            else:
                if job.return_final_tree_and_alignment:
                    tree_as_tmp_filename_to_report = job.curr_iter_tree_tmp_filename
                else:
                    tree_as_tmp_filename_to_report = job.best_tree_tmp_filename
        #######################################################################
        # Restore original taxon names and RNA characters
        #####
        job.multilocus_dataset.restore_taxon_names()
        if restore_to_rna:
            job.multilocus_dataset.convert_dna_to_rna()
            user_config.commandline.datatype = 'RNA'
        assert len(sate_products.alignment_streams) == len(job.multilocus_dataset)
        for i, alignment in enumerate(job.multilocus_dataset):
            alignment_stream = sate_products.alignment_streams[i]
            MESSENGER.send_info("Writing resulting alignment to %s" % alignment_stream.name)
            alignment.write(alignment_stream, file_format="FASTA")
            alignment_stream.close()
        MESSENGER.send_info("Writing resulting tree to %s" % sate_products.tree_stream.name)
        tree_str = job.tree.compose_newick()
        sate_products.tree_stream.write("%s;\n" % tree_str)
        #outtree_fn = options.result
        #if outtree_fn is None:
        #    if options.multilocus:
        #        outtree_fn = os.path.join(seqdir, "combined_%s.tre" % options.job)
        #    else:
        #        outtree_fn = aln_filename + ".tre"
        #MESSENGER.send_info("Writing resulting tree to %s" % outtree_fn)
        #tree_str = job.tree.compose_newick()
        #sate_products.tree_stream.write("%s;\n" % tree_str)
        MESSENGER.send_info("Writing resulting likelihood score to %s" % sate_products.score_stream.name)
        sate_products.score_stream.write("%s\n" % job.score)
        if alignment_as_tmp_filename_to_report is not None:
            MESSENGER.send_info(
                'The resulting alignment (with the names in a "safe" form) was first written as the file "%s"'
                % alignment_as_tmp_filename_to_report)
        if tree_as_tmp_filename_to_report is not None:
            MESSENGER.send_info(
                'The resulting tree (with the names in a "safe" form) was first written as the file "%s"'
                % tree_as_tmp_filename_to_report)
    finally:
        # Restore the signal handlers we displaced above.
        for el in prev_signals:
            sig, prev_handler = el
            if prev_handler is None:
                signal.signal(sig, signal.SIG_DFL)
            else:
                signal.signal(sig, prev_handler)