class Analysis(object):
    """Performs the analysis and collects the results"""

    def __init__(self, cfg, rpt, force_restart=False, save_phyml=False, threads=-1):
        """Prepare the analysis workspace.

        cfg           -- validated configuration (paths, models, datatype, ...)
        rpt           -- reporter object used to write out summaries
        force_restart -- if True, delete the entire previous output folder
        save_phyml    -- if True, keep raw phyml output files after parsing
        threads       -- number of worker threads (-1 is stored unchanged here)
        """
        cfg.validate()
        self.cfg = cfg
        self.rpt = rpt
        self.threads = threads
        self.save_phyml = save_phyml
        self.results = results.AnalysisResults()
        log.info("Beginning Analysis")

        if force_restart:
            # Remove everything
            if os.path.exists(self.cfg.output_path):
                log.warning("Deleting all previous workings in '%s'",
                            self.cfg.output_path)
                shutil.rmtree(self.cfg.output_path)
        else:
            # Just remove the schemes folder; subset data is kept so it can
            # be reused on restart
            if os.path.exists(self.cfg.schemes_path):
                log.info("Removing Schemes in '%s' (they will be "
                         "recalculated from existing subset data)",
                         self.cfg.schemes_path)
                shutil.rmtree(self.cfg.schemes_path)

        # Check for old analyses to see if we can use the old data
        self.cfg.check_for_old_config()

        # Make some folders for the analysis
        self.cfg.make_output_folders()
        self.make_alignment(cfg.alignment_path)
        self.make_tree(cfg.user_tree_topology_path)

        # Progress counters, only used to keep the user informed
        self.subsets_analysed_set = set()
        self.subsets_analysed = 0
        self.total_subset_num = None
        self.schemes_analysed = 0
        self.total_scheme_num = None

    def analyse(self):
        """Run the full analysis, finalise and report, and return the results."""
        self.do_analysis()
        self.results.finalise()
        self.report()
        return self.results

    def report(self):
        """Write the best-scheme and all-scheme summaries via the reporter."""
        best = [
            ("Best scheme according to AIC", self.results.best_aic),
            ("Best scheme according to AICc", self.results.best_aicc),
            ("Best scheme according to BIC", self.results.best_bic),
        ]
        self.rpt.write_best_schemes(best)
        self.rpt.write_all_schemes(self.results)

    def make_alignment(self, source_alignment_path):
        """Read the source alignment and cache a copy in the start-tree folder.

        Raises AnalysisError if a previously cached copy differs from the
        current source (the user must then use force-restart).
        """
        # Make the alignment
        self.alignment = Alignment()
        self.alignment.read(source_alignment_path)

        # We start by copying the alignment
        self.alignment_path = os.path.join(self.cfg.start_tree_path, 'source.phy')
        if os.path.exists(self.alignment_path):
            # Make sure it is the same
            old_align = Alignment()
            old_align.read(self.alignment_path)
            if not old_align.same_as(self.alignment):
                log.error("Alignment file has changed since previous run. "
                          "You need to use the force-restart option.")
                raise AnalysisError
        else:
            self.alignment.write(self.alignment_path)

    def make_tree(self, user_path):
        """Build (or reuse) the starting tree with branch lengths.

        user_path -- optional path to a user-supplied topology; if empty or
        None a topology is estimated from the filtered alignment.
        Sets self.tree_path.
        """
        # Begin by making a filtered alignment, containing ONLY those columns
        # that are defined in the subsets
        subset_with_everything = subset.Subset(*list(self.cfg.partitions))
        self.filtered_alignment = SubsetAlignment(
            self.alignment, subset_with_everything)
        self.filtered_alignment_path = os.path.join(
            self.cfg.start_tree_path, 'filtered_source.phy')
        self.filtered_alignment.write(self.filtered_alignment_path)

        # Now we've written this alignment, we need to lock everything in
        # place, no more adding partitions, or changing them from now on.
        self.cfg.partitions.check_against_alignment(self.alignment)
        self.cfg.partitions.finalise()

        # We start by copying the alignment
        self.alignment_path = os.path.join(self.cfg.start_tree_path, 'source.phy')

        # Now check for the tree
        tree_path = phyml.make_tree_path(self.filtered_alignment_path)
        if not os.path.exists(tree_path):
            # If we have a user tree, then use that, otherwise, create a topology
            if user_path is not None and user_path != "":
                # Copy it into the start tree folder
                log.info("Using user supplied topology at %s", user_path)
                topology_path = os.path.join(self.cfg.start_tree_path,
                                             'user_topology.phy')
                phyml.dupfile(user_path, topology_path)
            else:
                topology_path = phyml.make_topology(
                    self.filtered_alignment_path, self.cfg.datatype)

            # Now estimate branch lengths
            # NOTE(review): if datatype is neither "DNA" nor "protein",
            # tree_path silently stays pointing at a nonexistent file --
            # presumably cfg.validate() guarantees one of the two.
            if self.cfg.datatype == "DNA":
                tree_path = phyml.make_branch_lengths(
                    self.filtered_alignment_path, topology_path)
            elif self.cfg.datatype == "protein":
                tree_path = phyml.make_branch_lengths_protein(
                    self.filtered_alignment_path, topology_path)

        self.tree_path = tree_path
        log.info("Starting tree with branch lengths is here: %s", self.tree_path)

    def analyse_subset(self, sub, models):
        """Analyse the subset using the models given.

        This is the core place where everything comes together.
        The results are placed into subset.result.
        """
        log.debug("About to analyse %s using models %s",
                  sub, ", ".join(list(models)))

        # Keep people informed about what's going on.
        # If we don't know the total subset number, we can usually get it
        # like this (assumes sub._cache holds one entry per subset -- TODO
        # confirm against the subset module)
        if self.total_subset_num is None:
            self.total_subset_num = len(sub._cache)
        old_num_analysed = self.subsets_analysed
        self.subsets_analysed_set.add(sub.name)
        self.subsets_analysed = len(self.subsets_analysed_set)
        if self.subsets_analysed > old_num_analysed:
            # We've just analysed a subset we haven't seen yet
            percent_done = float(self.subsets_analysed) * 100.0 / float(self.total_subset_num)
            log.info("Analysing subset %d/%d: %.2f%s done"
                     % (self.subsets_analysed, self.total_subset_num,
                        percent_done, r"%"))

        subset_cache_path = os.path.join(self.cfg.subsets_path, sub.name + '.bin')
        # We might have already saved a bunch of results, try there first
        if not sub.results:
            log.debug("Reading in cached data from the subsets file")
            sub.read_cache(subset_cache_path)

        # First, see if we've already got the results loaded. Then we can
        # shortcut all the other checks
        models_done = set(sub.results.keys())
        log.debug("These models have already been done: %s", models_done)
        models_required = set(models)
        models_to_do = models_required - models_done
        log.debug("Which leaves these models still to analyse: %s", models_to_do)

        # Empty set means we're done
        if not models_to_do:
            log.debug("All models already done, so using just the cached "
                      "results for subset %s", sub)
            sub.model_selection(self.cfg.model_selection, self.cfg.models)
            return

        # Make an Alignment from the source, using this subset
        sub_alignment = SubsetAlignment(self.alignment, sub)
        sub_path = os.path.join(self.cfg.phyml_path, sub.name + '.phy')
        # Add it into the sub, so we keep it around
        sub.alignment_path = sub_path

        # Maybe it is there already?
        if os.path.exists(sub_path):
            log.debug("Found existing alignment file %s", sub_path)
            old_align = Alignment()
            old_align.read(sub_path)

            # It had better be the same!
            if not old_align.same_as(sub_alignment):
                log.error("It looks like you have changed one or more of the"
                          "data_blocks in the configuration file, "
                          "so the new subset alignments"
                          " don't match the ones stored for this analysis."
                          "You'll need to run the program with --force-restart")
                raise AnalysisError
        else:
            # We need to write it
            sub_alignment.write(sub_path)

        # Try and read in some previous analyses
        log.debug("Checking for old results in the phyml folder")
        self.parse_results(sub, models_to_do)
        if not models_to_do:
            sub.model_selection(self.cfg.model_selection, self.cfg.models)
            return

        # What is left, we actually have to analyse...
        tasks = []

        # For efficiency, we rank the models by their difficulty - most
        # difficult first. sorted() replaces the old zip(...).sort() which
        # only worked on Python 2 (zip returns an iterator on Python 3).
        difficulty = [get_model_difficulty(m) for m in models_to_do]
        difficulty_and_m = sorted(zip(difficulty, models_to_do), reverse=True)
        sorted_difficulty, sorted_models_to_do = zip(*difficulty_and_m)
        log.debug("About to analyse these models, in this order: %s",
                  sorted_models_to_do)

        for m in sorted_models_to_do:
            tasks.append((phyml.analyse,
                          (m, sub_path, self.tree_path, self.cfg.branchlengths)))

        if self.threads == 1:
            self.run_models_concurrent(tasks)
        else:
            self.run_models_threaded(tasks)

        # Now parse the models we've just done
        self.parse_results(sub, models_to_do)

        # This should be empty NOW!
        if models_to_do:
            log.error("Failed to run models %s; not sure why",
                      ", ".join(list(models_to_do)))
            raise AnalysisError

        # Now we have analysed all models for this subset, we do model
        # selection but ONLY on the models specified in the cfg file.
        sub.model_selection(self.cfg.model_selection, self.cfg.models)

        # If we made it to here, we should write out the new summary
        self.rpt.write_subset_summary(sub)
        # We also need to update this
        sub.write_cache(subset_cache_path)

    def parse_results(self, sub, models_to_do):
        """Read in the results and parse them.

        Successfully parsed models are removed (in place) from models_to_do
        and their results attached to *sub*.
        """
        models_done = []
        # Iterate over a copy, since we remove parsed models from the set
        for m in list(models_to_do):
            stats_path, tree_path = phyml.make_output_path(sub.alignment_path, m)
            if os.path.exists(stats_path):
                # Context manager closes the handle (it used to be leaked)
                with open(stats_path, 'rb') as stats_file:
                    sub_output = stats_file.read()
                # Annotate with the parameters of the model
                try:
                    result = phyml.parse(sub_output)
                    sub.add_model_result(m, result)
                    # Remove the current model from remaining ones
                    models_to_do.remove(m)

                    # Just used for the log message below
                    models_done.append(m)

                    # Unless asked to keep them, clean up the raw phyml files
                    if not self.save_phyml:
                        os.remove(stats_path)
                        os.remove(tree_path)
                except phyml.PhymlError:
                    log.warning("Failed loading parse output from %s."
                                "Output maybe corrupted. I'll run it again.",
                                stats_path)

        if models_done:
            log.debug("Loaded analysis for %s, models %s",
                      sub, ", ".join(models_done))

    def run_models_concurrent(self, tasks):
        """Run (func, args) tasks serially in this thread."""
        for func, args in tasks:
            func(*args)

    def run_models_threaded(self, tasks):
        """Run (func, args) tasks in a thread pool and wait for completion."""
        pool = threadpool.Pool(tasks, self.threads)
        pool.join()

    def analyse_scheme(self, sch, models):
        """Analyse every subset in scheme *sch*, record and report the result."""
        self.schemes_analysed = self.schemes_analysed + 1
        log.info("Analysing scheme %d/%d"
                 % (self.schemes_analysed, self.total_scheme_num))
        for sub in sch:
            self.analyse_subset(sub, models)

        # AIC needs the number of sequences
        number_of_seq = len(self.alignment.species)
        result = scheme.SchemeResult(sch, number_of_seq, self.cfg.branchlengths)
        self.results.add_scheme_result(result)

        # TODO: should put all paths into config. Then reporter should decide
        # whether to create stuff
        fname = os.path.join(self.cfg.schemes_path, sch.name + '.txt')
        # Close the summary file when done (it used to be leaked)
        with open(fname, 'w') as summary_file:
            self.rpt.write_scheme_summary(result, summary_file)

        return result
class Analysis(object):
    """Performs the analysis and collects the results"""

    def __init__(self, cfg, force_restart, threads):
        """Prepare the analysis: folders, database, alignment and start tree.

        cfg           -- configuration object (paths come from the module-level
                         the_config; cfg is only used for alignment/tree paths)
        force_restart -- if True, wipe the whole output folder first
        threads       -- worker count; -1 means use the machine's CPU count
        """
        the_config.validate()

        # TODO: Remove -- put this all into "options"
        if threads == -1:
            threads = threadpool.get_cpu_count()
        self.threads = threads

        # TODO: Move these to the config validate and prepare
        log.info("Beginning Analysis")
        self.process_restart(force_restart)

        # Make some folders for the analysis
        the_config.make_output_folders()
        the_config.database = Database(the_config)

        # Check for old analyses to see if we can use the old data
        the_config.check_for_old_config()

        # TODO: This is going to be in "Prepare"
        self.make_alignment(cfg.alignment_path)
        self.make_tree(cfg.user_tree_topology_path)

        # We need this to block the threads for critical stuff
        self.lock = threading.Condition(threading.Lock())

        # Store the result in here
        self.results = results.AnalysisResults(the_config.model_selection)

    def process_restart(self, force_restart):
        """Clean out previous workings, wholly or partially, before a run."""
        if force_restart:
            # Remove everything
            if os.path.exists(the_config.output_path):
                log.warning("Deleting all previous workings in '%s'" %
                            the_config.output_path)
                shutil.rmtree(the_config.output_path)
        else:
            # Remove the schemes folder, and clean out the phylofiles folder
            if os.path.exists(the_config.schemes_path):
                log.debug("Removing files in '%s'" % the_config.schemes_path)
                shutil.rmtree(the_config.schemes_path)
            if os.path.exists(the_config.phylofiles_path):
                log.debug("Removing files in '%s'" % the_config.phylofiles_path)
                shutil.rmtree(the_config.phylofiles_path)

    def analyse(self):
        """Run the analysis, always closing the database, and return results."""
        try:
            self.do_analysis()
        finally:
            # TODO: Not really the right place for it?
            the_config.database.close()
        return self.results

    def make_alignment(self, source_alignment_path):
        """Read the source alignment and cache a copy in the start-tree folder.

        Raises AnalysisError if a previously cached copy has different content
        or a different species set (force-restart is then required).
        """
        # Make the alignment
        self.alignment = Alignment()
        self.alignment.read(source_alignment_path)

        # TODO REMOVE -- this should be part of the checking procedure
        # We start by copying the alignment
        self.alignment_path = os.path.join(the_config.start_tree_path,
                                           'source.phy')
        if os.path.exists(self.alignment_path):
            # Make sure it is the same
            old_align = Alignment()
            old_align.read(self.alignment_path)
            if not old_align.same_as(self.alignment):
                log.error("""Alignment file has changed since previous run. You
                    need to use the force-restart option.""")
                raise AnalysisError
            # Order-insensitive species comparison (Counter replaces the old
            # lambda assigned to a name)
            if collections.Counter(old_align.species) != \
                    collections.Counter(self.alignment.species):
                log.error("""Species names in alignment have changed since
                    previous run. You need to use the force-restart option.""")
                raise AnalysisError
        else:
            self.alignment.write(self.alignment_path)

    def need_new_tree(self, tree_path):
        """Return True if the starting tree must be (re-)estimated.

        A tree file is treated as complete iff it contains a ';'.
        """
        if os.path.exists(tree_path):
            # with-statement closes the handle (it used to be leaked)
            with open(tree_path) as tree_file:
                complete = ';' in tree_file.read()
            if complete:
                log.info("Starting tree file found.")
                redo_tree = False
            else:
                log.info("""Starting tree file found but it is incomplete.
                    Re-estimating""")
                redo_tree = True
        else:
            log.info("Starting tree will be estimated from the data.")
            redo_tree = True
        return redo_tree

    def make_tree(self, user_path):
        """Build (or reuse) the starting tree; sets self.tree_path.

        user_path -- optional user topology; else an ML (RAxML) or quick
        topology is built depending on the_config.no_ml_tree.
        """
        # Begin by making a filtered alignment, containing ONLY those columns
        # that are defined in the subsets
        subset_with_everything = subset_ops.merge_subsets(
            the_config.user_subsets)
        self.filtered_alignment = SubsetAlignment(
            self.alignment, subset_with_everything)
        self.filtered_alignment_path = os.path.join(
            the_config.start_tree_path, 'filtered_source.phy')
        self.filtered_alignment.write(self.filtered_alignment_path)

        # Check the full subset against the alignment
        subset_ops.check_against_alignment(
            subset_with_everything, self.alignment, the_config)

        # We start by copying the alignment
        self.alignment_path = os.path.join(the_config.start_tree_path,
                                           'source.phy')

        # Now check for the tree
        tree_path = the_config.processor.make_tree_path(
            self.filtered_alignment_path)

        if self.need_new_tree(tree_path):
            log.debug("Estimating new starting tree, no old tree found")

            # If we have a user tree, then use that, otherwise, create a topology
            util.clean_out_folder(the_config.start_tree_path,
                                  keep=["filtered_source.phy", "source.phy"])

            if user_path is not None and user_path != "":
                # Copy it into the start tree folder
                log.info("Using user supplied topology at %s" % user_path)
                topology_path = os.path.join(the_config.start_tree_path,
                                             'user_topology.phy')
                util.dupfile(user_path, topology_path)
                need_bl = True
            elif the_config.no_ml_tree:
                log.debug("didn't find tree at %s, making a new one" %
                          tree_path)
                topology_path = the_config.processor.make_topology(
                    self.filtered_alignment_path, the_config.datatype,
                    the_config.cmdline_extras)
                need_bl = True
            else:
                # (was `elif no_ml_tree == False`, which could leave need_bl
                # unbound for non-bool values; an else guarantees it is set)
                log.debug(
                    "didn't find tree at %s, making an ML tree with RAxML" %
                    tree_path)
                tree_scheme = scheme.create_scheme(
                    the_config, "tree_scheme",
                    range(len(the_config.user_subsets)))
                topology_path = raxml.make_ml_topology(
                    self.filtered_alignment_path, the_config.datatype,
                    the_config.cmdline_extras, tree_scheme, self.threads)
                # here we copy the ML tree topology so it can be used with
                # PhyML too
                # TODO: this is a hack, and it would be better to decide on a
                # universal name for the different types of tree we might have.
                phyml_tree = os.path.join(
                    os.path.dirname(topology_path),
                    "filtered_source.phy_phyml_tree.txt")
                copyfile(topology_path, phyml_tree)
                need_bl = False

            if need_bl:
                # Now estimate branch lengths
                tree_path = the_config.processor.make_branch_lengths(
                    self.filtered_alignment_path,
                    topology_path,
                    the_config.datatype,
                    the_config.cmdline_extras)

        self.tree_path = tree_path
        log.debug("Starting tree with branch lengths is here: %s" %
                  self.tree_path)

    def run_task(self, model_name, sub):
        """Analyse one model for one subset; parse or fabricate the result.

        Runs the external processor, then, under the lock, attaches the
        parsed (or fabricated) result to *sub* and tries to finalise it.
        """
        # This bit should run in parallel (forking the processor)
        try:
            the_config.processor.analyse(
                model_name,
                sub.alignment_path,
                self.tree_path,
                the_config.branchlengths,
                the_config.cmdline_extras)
            fabricate = False
        except ExternalProgramError:
            if not the_config.suppress_errors:
                # In the Kmeans algorithm we suppress errors and "fabricate"
                # subsets (we assume the error is because the subset is too
                # small for analysis)
                raise
            # If it is kmeans we assume that the error is because the subset
            # is too small or unanalysable, so we fabricate it
            log.debug("New subset could not be analysed. It will be merged "
                      "at the end of the analysis")
            fabricate = True

        # Not entirely sure that WE NEED to block here, but it is safer to do
        # It shouldn't hold things up toooo long...
        self.lock.acquire()
        try:
            if fabricate:
                sub.fabricate_model_result(the_config, model_name)
            else:
                sub.parse_model_result(the_config, model_name)
            # Try finalising, then the result will get written out earlier...
            sub.finalise(the_config)
        finally:
            self.lock.release()

    def add_tasks_for_sub(self, tasks, sub):
        """Append one (run_task, (model, sub)) tuple per pending model."""
        for m in sub.models_to_process:
            tasks.append((self.run_task, (m, sub)))

    def run_concurrent(self, tasks):
        """Run the tasks serially in this thread."""
        for func, args in tasks:
            log.debug("About to analyse subset %s", args[1].name)
            func(*args)

    def run_threaded(self, tasks):
        """Run the tasks in a thread pool and wait for completion."""
        if not tasks:
            return
        pool = threadpool.Pool(tasks, self.threads)
        pool.join()

    def analyse_list_of_subsets(self, all_subsets):
        """Analyse a whole list of subsets, chunked, possibly in parallel.

        Raises AnalysisError if any subset still fails to finalise.
        """
        # Analyse bigger subsets first, for efficiency
        all_subsets.sort(key=lambda x: 1.0 / float(len(x.columns)))

        # Chunk the list into blocks of ~1000 tasks.
        # In empirical testing, this speeds things up a lot,
        # though we are not entirely sure why...
        n = 1000
        n = int(n / len(the_config.models))
        if n < 1:
            n = 1  # seems unlikely...
        log.debug("chunk size (in number of subsets) = %d", n)
        subset_chunks = [all_subsets[i:i + n]
                         for i in xrange(0, len(all_subsets), n)]

        for subsets in subset_chunks:
            # Prepare the list of tasks
            tasks = []
            for sub in subsets:
                if sub.is_done:
                    pass
                elif sub.is_prepared:
                    self.add_tasks_for_sub(tasks, sub)
                else:
                    sub.prepare(the_config, self.alignment)
                    self.add_tasks_for_sub(tasks, sub)

            if tasks:
                # Now do the analysis
                if self.threads == 1:
                    self.run_concurrent(tasks)
                else:
                    self.run_threaded(tasks)

        # Now see if we're done
        for sub in all_subsets:
            # ALL subsets should already be finalised in the task. We just
            # check again here
            if not sub.finalise(the_config):
                log.error("Failed to run models %s; not sure why" %
                          ", ".join(list(sub.models_not_done)))
                raise AnalysisError

    def analyse_scheme(self, sch):
        """Analyse the not-yet-done subsets of *sch* and record a SchemeResult."""
        # Progress
        the_config.progress.next_scheme()

        # Analyse the subsets in the scheme that aren't done.
        # NB for most schemes we will have all subsets done, so this saves time
        not_done = [sub for sub in sch if not sub.is_done]

        if not_done:
            self.analyse_list_of_subsets(not_done)

        # AIC needs the number of sequences
        number_of_seq = len(self.alignment.species)
        result = scheme.SchemeResult(sch, number_of_seq,
                                     the_config.branchlengths,
                                     the_config.model_selection)
        self.results.add_scheme_result(sch, result)

        return result
class Analysis(object):
    """Performs the analysis and collects the results"""

    def __init__(self, cfg, force_restart=False, threads=-1):
        """Prepare the analysis workspace.

        cfg           -- validated configuration (paths, processor, models)
        force_restart -- if True, wipe the whole output folder first
        threads       -- worker thread count (-1 is stored unchanged here)
        """
        cfg.validate()
        self.cfg = cfg
        self.threads = threads
        self.results = results.AnalysisResults(self.cfg.model_selection)

        log.info("Beginning Analysis")
        self.process_restart(force_restart)

        # Check for old analyses to see if we can use the old data
        self.cfg.check_for_old_config()

        # Make some folders for the analysis
        self.cfg.make_output_folders()
        self.make_alignment(cfg.alignment_path)
        self.make_tree(cfg.user_tree_topology_path)

        # We need this to block the threads for critical stuff
        self.lock = threading.Condition(threading.Lock())

    def process_restart(self, force_restart):
        """Clean out previous workings, wholly or partially, before a run."""
        if force_restart:
            # Remove everything
            if os.path.exists(self.cfg.output_path):
                log.warning("Deleting all previous workings in '%s'",
                            self.cfg.output_path)
                shutil.rmtree(self.cfg.output_path)
        else:
            # Just remove the schemes folder; subset data is reused
            if os.path.exists(self.cfg.schemes_path):
                log.info(
                    "Removing Schemes in '%s' (they will be recalculated from existing subset data)",
                    self.cfg.schemes_path)
                shutil.rmtree(self.cfg.schemes_path)

    def analyse(self):
        """Run the full analysis and return the collected results."""
        self.do_analysis()
        return self.results

    def make_alignment(self, source_alignment_path):
        """Read the source alignment and cache a copy in the start-tree folder.

        Raises AnalysisError if a previously cached copy differs from the
        current source (force-restart is then required).
        """
        # Make the alignment
        self.alignment = Alignment()
        self.alignment.read(source_alignment_path)

        # We start by copying the alignment
        self.alignment_path = os.path.join(self.cfg.start_tree_path,
                                           'source.phy')
        if os.path.exists(self.alignment_path):
            # Make sure it is the same
            old_align = Alignment()
            old_align.read(self.alignment_path)
            if not old_align.same_as(self.alignment):
                log.error(
                    "Alignment file has changed since previous run. You need to use the force-restart option."
                )
                raise AnalysisError
        else:
            self.alignment.write(self.alignment_path)

    def need_new_tree(self, tree_path):
        """Return True if the starting tree must be (re-)estimated.

        A tree file is treated as complete iff it contains a ';'.
        """
        if os.path.exists(tree_path):
            # with-statement closes the handle (it used to be leaked)
            with open(tree_path) as tree_file:
                complete = ';' in tree_file.read()
            if complete:
                log.info("Starting tree file found.")
                redo_tree = False
            else:
                log.info(
                    "Starting tree file found but incomplete. Re-estimating")
                redo_tree = True
        else:
            log.info("No starting tree file found.")
            redo_tree = True
        return redo_tree

    def make_tree(self, user_path):
        """Build (or reuse) the starting tree with branch lengths.

        user_path -- optional user topology; else one is estimated from the
        filtered alignment. Sets self.tree_path.
        """
        # Begin by making a filtered alignment, containing ONLY those columns
        # that are defined in the subsets
        subset_with_everything = subset.Subset(*list(self.cfg.partitions))
        self.filtered_alignment = SubsetAlignment(
            self.alignment, subset_with_everything)
        self.filtered_alignment_path = os.path.join(
            self.cfg.start_tree_path, 'filtered_source.phy')
        self.filtered_alignment.write(self.filtered_alignment_path)

        # Now we've written this alignment, we need to lock everything in
        # place, no more adding partitions, or changing them from now on.
        self.cfg.partitions.check_against_alignment(self.alignment)
        self.cfg.partitions.finalise()

        # We start by copying the alignment
        self.alignment_path = os.path.join(self.cfg.start_tree_path,
                                           'source.phy')

        # Now check for the tree
        tree_path = self.cfg.processor.make_tree_path(
            self.filtered_alignment_path)

        # (`== True` comparison dropped: need_new_tree always returns a bool)
        if self.need_new_tree(tree_path):
            log.debug("Estimating new starting tree, no old tree found")

            # If we have a user tree, then use that, otherwise, create a topology
            util.clean_out_folder(self.cfg.start_tree_path,
                                  keep=["filtered_source.phy", "source.phy"])

            if user_path is not None and user_path != "":
                # Copy it into the start tree folder
                log.info("Using user supplied topology at %s", user_path)
                topology_path = os.path.join(self.cfg.start_tree_path,
                                             'user_topology.phy')
                self.cfg.processor.dupfile(user_path, topology_path)
            else:
                log.debug("didn't find tree at %s, making a new one" %
                          tree_path)
                topology_path = self.cfg.processor.make_topology(
                    self.filtered_alignment_path, self.cfg.datatype,
                    self.cfg.cmdline_extras)

            # Now estimate branch lengths
            tree_path = self.cfg.processor.make_branch_lengths(
                self.filtered_alignment_path,
                topology_path,
                self.cfg.datatype,
                self.cfg.cmdline_extras)

        self.tree_path = tree_path
        log.info("Starting tree with branch lengths is here: %s",
                 self.tree_path)

    def run_task(self, m, sub):
        """Analyse model *m* for subset *sub*, then parse/finalise under lock."""
        # This bit should run in parallel (forking the processor)
        self.cfg.processor.analyse(m, sub.alignment_path, self.tree_path,
                                   self.cfg.branchlengths,
                                   self.cfg.cmdline_extras)

        # Not entirely sure that WE NEED to block here, but it is safer to do
        # It shouldn't hold things up toooo long...
        self.lock.acquire()
        try:
            sub.parse_model_result(self.cfg, m)
            # Try finalising, then the result will get written out earlier...
            sub.finalise(self.cfg)
        finally:
            self.lock.release()

    def add_tasks_for_sub(self, tasks, sub):
        """Append one (run_task, (model, sub)) tuple per pending model."""
        for m in sub.models_to_process:
            tasks.append((self.run_task, (m, sub)))

    def run_concurrent(self, tasks):
        """Run the tasks serially in this thread."""
        for func, args in tasks:
            func(*args)

    def run_threaded(self, tasks):
        """Run the tasks in a thread pool and wait for completion."""
        if not tasks:
            return
        pool = threadpool.Pool(tasks, self.threads)
        pool.join()

    def analyse_scheme(self, sch):
        """Analyse every subset of scheme *sch* and record a SchemeResult."""
        # Progress
        self.cfg.progress.next_scheme()

        # Prepare by reading everything in first
        tasks = []
        for sub in sch:
            sub.prepare(self.cfg, self.alignment)
            self.add_tasks_for_sub(tasks, sub)

        # Now do the analysis
        if self.threads == 1:
            self.run_concurrent(tasks)
        else:
            self.run_threaded(tasks)

        # Now see if we're done
        for sub in sch:
            # ALL subsets should already be finalised in the task. We just
            # check again here
            if not sub.finalise(self.cfg):
                # NOTE(review): this reads sub.models_to_do while the rest of
                # this class uses models_to_process -- confirm the attribute
                # exists on the subset type
                log.error("Failed to run models %s; not sure why",
                          ", ".join(list(sub.models_to_do)))
                raise AnalysisError

        # AIC needs the number of sequences
        number_of_seq = len(self.alignment.species)
        result = scheme.SchemeResult(sch, number_of_seq,
                                     self.cfg.branchlengths,
                                     self.cfg.model_selection)
        self.results.add_scheme_result(sch, result)

        return result
class Analysis(object):
    """Performs the analysis and collects the results"""

    def __init__(self, cfg, force_restart=False, threads=-1):
        """Set up folders, alignment, starting tree and the thread lock.

        cfg           -- validated configuration (paths, processor, models)
        force_restart -- if True, delete all previous workings first
        threads       -- worker thread count (-1 is stored unchanged here)
        """
        cfg.validate()
        self.cfg = cfg
        self.threads = threads
        self.results = results.AnalysisResults(self.cfg.model_selection)

        log.info("Beginning Analysis")
        self.process_restart(force_restart)

        # Check for old analyses to see if we can use the old data
        self.cfg.check_for_old_config()

        # Make some folders for the analysis
        self.cfg.make_output_folders()
        self.make_alignment(cfg.alignment_path)
        self.make_tree(cfg.user_tree_topology_path)

        # We need this to block the threads for critical stuff
        self.lock = threading.Condition(threading.Lock())

    def process_restart(self, force_restart):
        """Remove old output: everything, or just the schemes folder."""
        if force_restart:
            # Remove everything
            if os.path.exists(self.cfg.output_path):
                log.warning("Deleting all previous workings in '%s'",
                            self.cfg.output_path)
                shutil.rmtree(self.cfg.output_path)
        else:
            # Just remove the schemes folder; subset data is kept and reused
            if os.path.exists(self.cfg.schemes_path):
                log.info("Removing Schemes in '%s' (they will be recalculated from existing subset data)",
                         self.cfg.schemes_path)
                shutil.rmtree(self.cfg.schemes_path)

    def analyse(self):
        """Run the full analysis and return the collected results."""
        self.do_analysis()
        return self.results

    def make_alignment(self, source_alignment_path):
        """Read the source alignment and cache a copy in the start-tree folder.

        Raises AnalysisError if a previously cached copy differs from the
        current source (force-restart is then required).
        """
        # Make the alignment
        self.alignment = Alignment()
        self.alignment.read(source_alignment_path)

        # We start by copying the alignment
        self.alignment_path = os.path.join(self.cfg.start_tree_path,
                                           'source.phy')
        if os.path.exists(self.alignment_path):
            # Make sure it is the same
            old_align = Alignment()
            old_align.read(self.alignment_path)
            if not old_align.same_as(self.alignment):
                log.error("Alignment file has changed since previous run. You need to use the force-restart option.")
                raise AnalysisError
        else:
            self.alignment.write(self.alignment_path)

    def need_new_tree(self, tree_path):
        """Return True if the starting tree must be (re-)estimated.

        A tree file is treated as complete iff it contains a ';'.
        """
        if os.path.exists(tree_path):
            # with-statement closes the handle (it used to be leaked)
            with open(tree_path) as tree_file:
                complete = ';' in tree_file.read()
            if complete:
                log.info("Starting tree file found.")
                redo_tree = False
            else:
                log.info("Starting tree file found but incomplete. Re-estimating")
                redo_tree = True
        else:
            log.info("No starting tree file found.")
            redo_tree = True
        return redo_tree

    def make_tree(self, user_path):
        """Build (or reuse) the starting tree with branch lengths.

        user_path -- optional user topology; else one is estimated from the
        filtered alignment. Sets self.tree_path.
        """
        # Begin by making a filtered alignment, containing ONLY those columns
        # that are defined in the subsets
        subset_with_everything = subset.Subset(*list(self.cfg.partitions))
        self.filtered_alignment = SubsetAlignment(
            self.alignment, subset_with_everything)
        self.filtered_alignment_path = os.path.join(
            self.cfg.start_tree_path, 'filtered_source.phy')
        self.filtered_alignment.write(self.filtered_alignment_path)

        # Now we've written this alignment, we need to lock everything in
        # place, no more adding partitions, or changing them from now on.
        self.cfg.partitions.check_against_alignment(self.alignment)
        self.cfg.partitions.finalise()

        # We start by copying the alignment
        self.alignment_path = os.path.join(self.cfg.start_tree_path,
                                           'source.phy')

        # Now check for the tree
        tree_path = self.cfg.processor.make_tree_path(
            self.filtered_alignment_path)

        # (redundant `== True` dropped: need_new_tree always returns a bool)
        if self.need_new_tree(tree_path):
            log.debug("Estimating new starting tree, no old tree found")

            # If we have a user tree, then use that, otherwise, create a topology
            util.clean_out_folder(self.cfg.start_tree_path,
                                  keep=["filtered_source.phy", "source.phy"])

            if user_path is not None and user_path != "":
                # Copy it into the start tree folder
                log.info("Using user supplied topology at %s", user_path)
                topology_path = os.path.join(self.cfg.start_tree_path,
                                             'user_topology.phy')
                self.cfg.processor.dupfile(user_path, topology_path)
            else:
                log.debug(
                    "didn't find tree at %s, making a new one" % tree_path)
                topology_path = self.cfg.processor.make_topology(
                    self.filtered_alignment_path, self.cfg.datatype,
                    self.cfg.cmdline_extras)

            # Now estimate branch lengths
            tree_path = self.cfg.processor.make_branch_lengths(
                self.filtered_alignment_path,
                topology_path,
                self.cfg.datatype,
                self.cfg.cmdline_extras)

        self.tree_path = tree_path
        log.info("Starting tree with branch lengths is here: %s",
                 self.tree_path)

    def run_task(self, m, sub):
        """Analyse model *m* for subset *sub*, then parse/finalise under lock."""
        # This bit should run in parallel (forking the processor)
        self.cfg.processor.analyse(
            m,
            sub.alignment_path,
            self.tree_path,
            self.cfg.branchlengths,
            self.cfg.cmdline_extras
        )

        # Not entirely sure that WE NEED to block here, but it is safer to do
        # It shouldn't hold things up toooo long...
        self.lock.acquire()
        try:
            sub.parse_model_result(self.cfg, m)
            # Try finalising, then the result will get written out earlier...
            sub.finalise(self.cfg)
        finally:
            self.lock.release()

    def add_tasks_for_sub(self, tasks, sub):
        """Append one (run_task, (model, sub)) tuple per pending model."""
        for m in sub.models_to_process:
            tasks.append((self.run_task, (m, sub)))

    def run_concurrent(self, tasks):
        """Run the tasks serially in this thread."""
        for func, args in tasks:
            func(*args)

    def run_threaded(self, tasks):
        """Run the tasks in a thread pool and wait for completion."""
        if not tasks:
            return
        pool = threadpool.Pool(tasks, self.threads)
        pool.join()

    def analyse_scheme(self, sch):
        """Analyse every subset of scheme *sch* and record a SchemeResult."""
        # Progress
        self.cfg.progress.next_scheme()

        # Prepare by reading everything in first
        tasks = []
        for sub in sch:
            sub.prepare(self.cfg, self.alignment)
            self.add_tasks_for_sub(tasks, sub)

        # Now do the analysis
        if self.threads == 1:
            self.run_concurrent(tasks)
        else:
            self.run_threaded(tasks)

        # Now see if we're done
        for sub in sch:
            # ALL subsets should already be finalised in the task. We just
            # check again here
            if not sub.finalise(self.cfg):
                # NOTE(review): reads sub.models_to_do while the rest of this
                # class uses models_to_process -- confirm the attribute exists
                log.error("Failed to run models %s; not sure why",
                          ", ".join(list(sub.models_to_do)))
                raise AnalysisError

        # AIC needs the number of sequences
        number_of_seq = len(self.alignment.species)
        result = scheme.SchemeResult(sch, number_of_seq,
                                     self.cfg.branchlengths,
                                     self.cfg.model_selection)
        self.results.add_scheme_result(sch, result)

        return result
class Analysis(object):
    """Performs the analysis and collects the results.

    Drives a whole PartitionFinder run: cleans/prepares the output folders
    and results database, reads the source alignment, obtains a starting
    tree (user-supplied, quick topology, or RAxML ML tree), then analyses
    schemes by farming out per-subset model fitting, optionally across
    threads.

    NOTE(review): the class reads the module-level ``the_config`` global as
    well as the ``cfg`` argument passed to ``__init__`` -- mid-refactor,
    both are assumed to refer to the same configuration object (confirm
    against the caller).
    """

    def __init__(self, cfg, force_restart, threads):
        the_config.validate()

        # TODO: Remove -- put this all into "options"
        # threads == -1 means "use all available cores".
        if threads == -1:
            threads = threadpool.get_cpu_count()
        self.threads = threads

        # TODO: Move these to the config validate and prepare
        log.info("Beginning Analysis")
        self.process_restart(force_restart)

        # Make some folders for the analysis
        the_config.make_output_folders()
        the_config.database = Database(the_config)

        # Check for old analyses to see if we can use the old data
        the_config.check_for_old_config()

        # TODO: This is going to be in "Prepare"
        self.make_alignment(cfg.alignment_path)
        self.make_tree(cfg.user_tree_topology_path)

        # We need this to block the threads for critical stuff
        self.lock = threading.Condition(threading.Lock())

        # Store the result in here
        self.results = results.AnalysisResults(the_config.model_selection)

    def process_restart(self, force_restart):
        """Clean out previous workings.

        With force_restart, everything under the output path is deleted;
        otherwise only the schemes and phylofiles folders are removed so
        existing subset results can be reused.
        """
        if force_restart:
            # Remove everything
            if os.path.exists(the_config.output_path):
                log.warning("Deleting all previous workings in '%s'" %
                            the_config.output_path)
                shutil.rmtree(the_config.output_path)
        else:
            # Remove the schemes folder, and clean out the phylofiles folder
            if os.path.exists(the_config.schemes_path):
                log.debug("Removing files in '%s'" % the_config.schemes_path)
                shutil.rmtree(the_config.schemes_path)
            if os.path.exists(the_config.phylofiles_path):
                log.debug("Removing files in '%s'" %
                          the_config.phylofiles_path)
                shutil.rmtree(the_config.phylofiles_path)

    def analyse(self):
        """Run the analysis, always closing the database, and return results."""
        try:
            self.do_analysis()
        finally:
            # TODO: Not really the right place for it?
            the_config.database.close()
        return self.results

    def make_alignment(self, source_alignment_path):
        """Read the source alignment and cache a copy in the start-tree folder.

        If a copy from a previous run exists it must match the current
        alignment (content and species multiset), otherwise AnalysisError
        is raised and the user is told to force-restart.
        """
        # Make the alignment
        self.alignment = Alignment()
        self.alignment.read(source_alignment_path)

        # TODO REMOVE -- this should be part of the checking procedure
        # We start by copying the alignment
        self.alignment_path = os.path.join(
            the_config.start_tree_path, 'source.phy')
        if os.path.exists(self.alignment_path):
            # Make sure it is the same as the one from the previous run
            old_align = Alignment()
            old_align.read(self.alignment_path)
            if not old_align.same_as(self.alignment):
                log.error("""Alignment file has changed since previous run.
                You need to use the force-restart option.""")
                raise AnalysisError

            # Multiset comparison: same species names, order-insensitive.
            # (Replaces a lambda-assigned helper -- PEP 8 E731.)
            if (collections.Counter(old_align.species) !=
                    collections.Counter(self.alignment.species)):
                log.error("""Species names in alignment have changed since
                previous run. You need to use the force-restart option.""")
                raise AnalysisError
        else:
            self.alignment.write(self.alignment_path)

    def need_new_tree(self, tree_path):
        """Return True if the starting tree must be (re-)estimated.

        A tree file without a terminating semicolon is treated as an
        incomplete leftover from an interrupted run.
        """
        if os.path.exists(tree_path):
            # FIX: close the file handle (was open(...).read(), which
            # leaked the descriptor until garbage collection).
            with open(tree_path) as tree_file:
                tree_data = tree_file.read()
            if ';' in tree_data:
                log.info("Starting tree file found.")
                redo_tree = False
            else:
                log.info("""Starting tree file found but it is incomplete.
                Re-estimating""")
                redo_tree = True
        else:
            log.info("Starting tree will be estimated from the data.")
            redo_tree = True
        return redo_tree

    def make_tree(self, user_path):
        """Create (or reuse) the starting tree with branch lengths.

        Writes a filtered alignment containing only the columns covered by
        the user subsets, then either reuses an existing complete tree or
        builds one from the user topology / a quick topology / an ML tree,
        estimating branch lengths where needed. Sets self.tree_path.
        """
        # Begin by making a filtered alignment, containing ONLY those
        # columns that are defined in the subsets
        subset_with_everything = subset_ops.merge_subsets(
            the_config.user_subsets)
        self.filtered_alignment = SubsetAlignment(
            self.alignment, subset_with_everything)
        self.filtered_alignment_path = os.path.join(
            the_config.start_tree_path, 'filtered_source.phy')
        self.filtered_alignment.write(self.filtered_alignment_path)

        # Check the full subset against the alignment
        subset_ops.check_against_alignment(
            subset_with_everything, self.alignment, the_config)

        # We start by copying the alignment
        self.alignment_path = os.path.join(
            the_config.start_tree_path, 'source.phy')

        # Now check for the tree
        tree_path = the_config.processor.make_tree_path(
            self.filtered_alignment_path)

        if self.need_new_tree(tree_path):
            log.debug("Estimating new starting tree, no old tree found")

            # If we have a user tree, then use that, otherwise, create a
            # topology
            util.clean_out_folder(the_config.start_tree_path,
                                  keep=["filtered_source.phy", "source.phy"])

            if user_path is not None and user_path != "":
                # Copy it into the start tree folder
                log.info("Using user supplied topology at %s" % user_path)
                topology_path = os.path.join(the_config.start_tree_path,
                                             'user_topology.phy')
                util.dupfile(user_path, topology_path)
                need_bl = True
            elif the_config.no_ml_tree:
                log.debug(
                    "didn't find tree at %s, making a new one" % tree_path)
                topology_path = the_config.processor.make_topology(
                    self.filtered_alignment_path, the_config.datatype,
                    the_config.cmdline_extras)
                need_bl = True
            else:
                # FIX: was `elif the_config.no_ml_tree == False:` -- a
                # plain else guarantees topology_path/need_bl are always
                # bound (no NameError for an unexpected flag value).
                log.debug(
                    "didn't find tree at %s, making an ML tree with RAxML"
                    % tree_path)
                tree_scheme = scheme.create_scheme(
                    the_config, "tree_scheme",
                    range(len(the_config.user_subsets)))
                topology_path = raxml.make_ml_topology(
                    self.filtered_alignment_path, the_config.datatype,
                    the_config.cmdline_extras, tree_scheme, self.threads)
                # here we copy the ML tree topology so it can be used with
                # PhyML too
                # TODO: this is a hack, and it would be better to decide on
                # a universal name for the different types of tree we might
                # have.
                phyml_tree = os.path.join(
                    os.path.dirname(topology_path),
                    "filtered_source.phy_phyml_tree.txt")
                copyfile(topology_path, phyml_tree)
                # The ML tree already has branch lengths.
                need_bl = False

            if need_bl:
                # Now estimate branch lengths
                tree_path = the_config.processor.make_branch_lengths(
                    self.filtered_alignment_path,
                    topology_path,
                    the_config.datatype,
                    the_config.cmdline_extras)

        self.tree_path = tree_path
        log.debug("Starting tree with branch lengths is here: %s" %
                  self.tree_path)

    def run_task(self, model_name, sub):
        """Analyse one (model, subset) pair; parse/fabricate the result.

        Runs the external processor (this part may execute in parallel),
        then under the shared lock records the result on the subset.
        """
        # This bit should run in parallel (forking the processor)
        try:
            the_config.processor.analyse(
                model_name,
                sub.alignment_path,
                self.tree_path,
                the_config.branchlengths,
                the_config.cmdline_extras
            )
            fabricate = False
        except ExternalProgramError:
            if not the_config.suppress_errors:
                # In the Kmeans algorithm we suppress errors and "fabricate"
                # subsets (we assume the error is because the subset is too
                # small for analysis)
                raise
            # If it is kmeans we assume that the error is because the subset
            # is too small or unanalysable, so we fabricate it
            log.debug("New subset could not be analysed. It will be merged "
                      "at the end of the analysis")
            fabricate = True

        # Not entirely sure that WE NEED to block here, but it is safer to do
        # It shouldn't hold things up toooo long...
        self.lock.acquire()
        try:
            if fabricate:
                sub.fabricate_model_result(the_config, model_name)
            else:
                sub.parse_model_result(the_config, model_name)

            # Try finalising, then the result will get written out earlier...
            sub.finalise(the_config)
        finally:
            self.lock.release()

    def add_tasks_for_sub(self, tasks, sub):
        """Append one (run_task, args) tuple per unprocessed model of sub."""
        for m in sub.models_to_process:
            tasks.append((self.run_task, (m, sub)))

    def run_concurrent(self, tasks):
        """Run the tasks serially in this thread (single-threaded mode)."""
        for func, args in tasks:
            log.debug("About to analyse subset %s", args[1].name)
            func(*args)

    def run_threaded(self, tasks):
        """Run the tasks on a thread pool of self.threads workers."""
        if not tasks:
            return
        pool = threadpool.Pool(tasks, self.threads)
        pool.join()

    def analyse_list_of_subsets(self, all_subsets):
        """Analyse a list of subsets, potentially in parallel.

        Bigger subsets are analysed first for efficiency, and work is
        submitted in chunks of roughly 1000 tasks.

        Raises AnalysisError if any subset still has unfinished models
        afterwards.
        """
        # analyse bigger subsets first, for efficiency
        all_subsets.sort(key=lambda x: 1.0 / float(len(x.columns)))

        # chunk the list into blocks of ~1000 tasks
        # in empirical testing, this speeds things up lot
        # though we are not entirely sure why...
        n = 1000
        n = int(n / len(the_config.models))
        if n < 1:
            n = 1  # seems unlikely...
        log.debug("chunk size (in number of subsets) = %d", n)
        subset_chunks = [all_subsets[i:i + n]
                         for i in xrange(0, len(all_subsets), n)]

        for subsets in subset_chunks:
            # prepare the list of tasks, skipping already-done subsets
            tasks = []
            for sub in subsets:
                if sub.is_done:
                    continue
                if not sub.is_prepared:
                    sub.prepare(the_config, self.alignment)
                self.add_tasks_for_sub(tasks, sub)

            if tasks:
                # Now do the analysis
                if self.threads == 1:
                    self.run_concurrent(tasks)
                else:
                    self.run_threaded(tasks)

        # Now see if we're done
        for sub in all_subsets:
            # ALL subsets should already be finalised in the task. We just
            # check again here
            if not sub.finalise(the_config):
                log.error("Failed to run models %s; not sure why" %
                          ", ".join(list(sub.models_not_done)))
                raise AnalysisError

    def analyse_scheme(self, sch):
        """Analyse one scheme and record/return its SchemeResult."""
        # Progress
        the_config.progress.next_scheme()

        # analyse the subsets in the scheme that aren't done
        # NB for most schemes we will have all subsets done, so this saves
        # time
        not_done = [sub for sub in sch if not sub.is_done]

        if not_done:
            self.analyse_list_of_subsets(not_done)

        # AIC needs the number of sequences
        number_of_seq = len(self.alignment.species)
        result = scheme.SchemeResult(sch, number_of_seq,
                                     the_config.branchlengths,
                                     the_config.model_selection)
        self.results.add_scheme_result(sch, result)

        return result