def do_analysis(self):
    """Run the relaxed clustering algorithm (rcluster / rclusterf).

    Starting from the fully partitioned scheme, repeatedly merge the
    most similar pairs of subsets while the information score (given by
    the_config.model_selection) improves, then write out the best scheme.
    Stops when no candidate merge improves the score or everything has
    been merged into a single subset.
    """
    # initialisation steps
    model_selection = the_config.model_selection
    partnum = len(the_config.user_subsets)

    scheme_count = submodels.count_relaxed_clustering_schemes(
        partnum, the_config.cluster_percent, the_config.cluster_max)
    subset_count = submodels.count_relaxed_clustering_subsets(
        partnum, the_config.cluster_percent, the_config.cluster_max)

    log.info("PartitionFinder will have to analyse %d subsets to"
             " complete this analyses" % subset_count)
    the_config.progress.begin(scheme_count, subset_count)

    # Start with the most partitioned scheme, and record it.
    with logtools.indented(log, "*** Analysing starting scheme ***"):
        the_config.progress.begin(scheme_count, partnum)
        start_scheme = scheme.create_scheme(
            the_config, "start_scheme", range(partnum))
        start_result = self.analyse_scheme(start_scheme)
        start_score = start_result.score
        if not the_config.quick:
            the_config.reporter.write_scheme_summary(
                self.results.best_scheme, self.results.best_result)

    subsets = [s for s in start_scheme.subsets]
    partnum = len(subsets)
    step = 1
    while True:
        with logtools.indented(
                log,
                "*** Relaxed clustering algorithm step %d of up to %d ***"
                % (step, partnum - 1)):

            # get distances between subsets
            max_schemes = comb(len(start_scheme.subsets), 2)
            log.info("Measuring the similarity of %d subset pairs" %
                     max_schemes)
            d_matrix = neighbour.get_distance_matrix(
                subsets, the_config.cluster_weights)

            if step == 1:
                # Now initialise a change in info score matrix to inf
                c_matrix = np.empty(d_matrix.shape)
                c_matrix[:] = np.inf
                c_matrix = spatial.distance.squareform(c_matrix)

            # 1. pick top N subset pairs from distance matrix
            cutoff = int(math.ceil(max_schemes *
                                   (the_config.cluster_percent * 0.01)))
            if cutoff <= 0:
                cutoff = 1
            # identity check ('is not None') rather than '!= None'
            if the_config.cluster_max is not None and \
                    cutoff > the_config.cluster_max:
                cutoff = the_config.cluster_max
            log.info("Choosing the %d most similar subset pairs" % cutoff)
            closest_pairs = neighbour.get_N_closest_subsets(
                subsets, the_config, cutoff, d_matrix)

            # 2. analyse K subsets in top N that have not yet been analysed
            pairs_todo = neighbour.get_pairs_todo(
                closest_pairs, c_matrix, subsets)
            if len(pairs_todo) > 0:
                log.info("Analysing %d new subset pairs" % len(pairs_todo))
                new_subs = []
                sub_tuples = []
                for pair in pairs_todo:
                    new_sub = subset_ops.merge_subsets(pair)
                    new_subs.append(new_sub)
                    sub_tuples.append((new_sub, pair))

                the_config.progress.begin(scheme_count, len(new_subs))
                self.analyse_list_of_subsets(new_subs)

                # 3. for all K new subsets, update improvement matrix
                #    and find best pair
                log.info("Finding the best partitioning scheme")
                diffs = []
                scheme_name = "step_%d" % (step)
                for t in sub_tuples:
                    pair_merged = t[0]
                    pair = t[1]
                    new_scheme = neighbour.make_clustered_scheme(
                        start_scheme, scheme_name, pair,
                        pair_merged, the_config)
                    r = self.analyse_scheme(new_scheme)
                    diff = r.score - start_score
                    diffs.append(diff)

                c_matrix = neighbour.update_c_matrix(
                    c_matrix, sub_tuples, subsets, diffs)

            # 4. Find the best pair of subsets, and build a scheme based
            # on that. Note that this matrix includes diagonals, which
            # will all be zero, since this is equivalent to comparing a
            # scheme to itself. So we need to be careful to only proceed
            # if we have a negative change, which indicates an improvement
            # in the score
            best_change = np.amin(c_matrix)
            best_scheme = start_scheme

            if best_change >= 0:
                log.info(
                    "Found no schemes that improve the score, stopping")
                break

            median_improvement = np.median(c_matrix[c_matrix < 0])

            while best_change <= median_improvement:
                best_pair = neighbour.get_best_pair(
                    c_matrix, best_change, subsets)
                best_merged = subset_ops.merge_subsets(best_pair)
                best_scheme = neighbour.make_clustered_scheme(
                    start_scheme, scheme_name, best_pair,
                    best_merged, the_config)
                start_scheme = best_scheme

                log.info("Combining subsets: '%s' and '%s'" %
                         (best_pair[0].name, best_pair[1].name))
                log.info("This improves the %s score by: %s",
                         the_config.model_selection, str(abs(best_change)))

                # reset_c_matrix and the subset list
                c_matrix = neighbour.reset_c_matrix(
                    c_matrix, list(best_pair), [best_merged], subsets)

                # we update the subset list in a way that means its
                # structure tracks the c-matrix
                subsets = neighbour.reset_subsets(
                    subsets, list(best_pair), [best_merged])

                best_change = np.amin(c_matrix)

                if the_config.search == 'rcluster':
                    # with rcluster we just take the single best change;
                    # otherwise we are using rclusterf, which continues
                    # in this loop
                    break

            # the best change can get updated a fraction at this point
            # because calculating the info score on the whole alignment
            # is a little different from doing it on the one subset
            best_result = self.analyse_scheme(best_scheme)
            best_change = self.results.best_score - start_score

            log.info(
                "The best scheme has %d subsets and improves the %s"
                " score by %.2f to %.1f",
                len(best_scheme.subsets),
                the_config.model_selection,
                np.abs(best_change),
                self.results.best_score)
            start_scheme = best_scheme
            start_score = best_result.score

            if not the_config.quick:
                the_config.reporter.write_scheme_summary(
                    best_scheme, best_result)

            # all subsets merged into one: nothing left to cluster
            if len(set(start_scheme.subsets)) == 1:
                break

            step += 1

    log.info("Relaxed clustering algorithm finished after %d steps" % step)
    log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
             % (self.results.best_scheme.name, model_selection,
                self.results.best_score))

    if the_config.min_subset_size or the_config.all_states:
        best_scheme = self.clean_scheme(self.results.best_scheme)
        best_result = self.analyse_scheme(best_scheme)

        # scores after cleaning can be worse, so we reset these trackers...
        self.results.best_result = best_result
        self.results.best_score = best_result.score
        self.results.best_scheme = best_scheme

        log.info("Best scoring scheme after cleaning is scheme %s,"
                 " with %s score of %.3f"
                 % (self.results.best_scheme.name, model_selection,
                    self.results.best_score))

    the_config.reporter.write_best_scheme(self.results)
def do_analysis(self):
    """Perform a relaxed clustering analysis.

    Greedily lumps the most similar subsets together, analysing only the
    top cluster_percent of the ranked candidate merges at each step, and
    stops as soon as a step fails to improve the model-selection score.
    """
    log.info("Performing relaxed clustering analysis")

    # Fraction of ranked lumpings to analyse each step (percent -> ratio).
    stop_at = self.cfg.cluster_percent * 0.01

    model_selection = self.cfg.model_selection
    partnum = len(self.cfg.partitions)

    scheme_count = submodels.count_relaxed_clustering_schemes(
        partnum, self.cfg.cluster_percent)
    subset_count = submodels.count_relaxed_clustering_subsets(
        partnum, self.cfg.cluster_percent)
    self.cfg.progress.begin(scheme_count, subset_count)

    # Start with the most partitioned scheme, and record it.
    start_description = range(len(self.cfg.partitions))
    start_scheme = scheme.create_scheme(
        self.cfg, "start_scheme", start_description)

    log.info("Analysing starting scheme (scheme %s)" % start_scheme.name)
    self.analyse_scheme(start_scheme)
    self.cfg.reporter.write_scheme_summary(
        self.results.best_scheme, self.results.best_result)

    # (an unused 'subset_counter' bookkeeping variable was removed)
    step = 1
    while True:
        log.info("***Relaxed clustering algorithm step %d of %d***"
                 % (step, partnum - 1))
        name_prefix = "step_%d" % (step)

        # Get a list of all possible lumpings of the best_scheme, ordered
        # according to the clustering weights
        lumped_subsets = neighbour.get_ranked_clustered_subsets(
            start_scheme, self.cfg)

        # reduce the size of the lumped subsets to cluster_percent long
        # round up to stop zeros
        cutoff = int(math.ceil(len(lumped_subsets) * stop_at))
        lumped_subsets = lumped_subsets[:cutoff]

        # Now analyse the lumped schemes
        lumpings_done = 0
        old_best_score = self.results.best_score

        for subset_grouping in lumped_subsets:
            scheme_name = "%s_%d" % (name_prefix, lumpings_done + 1)
            lumped_scheme = neighbour.make_clustered_scheme(
                start_scheme, scheme_name, subset_grouping, self.cfg)

            new_result = self.analyse_scheme(lumped_scheme)
            log.debug("Difference in %s: %.1f",
                      self.cfg.model_selection,
                      (new_result.score - old_best_score))
            lumpings_done += 1

        if self.results.best_score != old_best_score:
            log.info(
                "Analysed %.1f percent of the schemes for this step. The best "
                "scheme changed the %s score by %.1f units.",
                self.cfg.cluster_percent, self.cfg.model_selection,
                (self.results.best_score - old_best_score))

            # write out the best scheme
            self.results.best_scheme.name = "step_%d" % step
            self.cfg.reporter.write_scheme_summary(
                self.results.best_scheme, self.results.best_result)

            # Now we find out which is the best lumping we know of for
            # this step
            start_scheme = self.results.best_scheme
        else:
            log.info(
                "Analysed %.1f percent of the schemes for this step and found no schemes "
                "that improve the score, stopping",
                self.cfg.cluster_percent)
            break

        # We're done if it's the scheme with everything together
        if len(set(lumped_scheme.subsets)) == 1:
            break

        step += 1

    log.info("Relaxed clustering algorithm finished after %d steps" % step)
    log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
             % (self.results.best_scheme.name, model_selection,
                self.results.best_score))

    self.cfg.reporter.write_best_scheme(self.results)
def do_analysis(self):
    """Run the relaxed clustering algorithm (rcluster / rclusterf).

    Same greedy merge loop as the rcluster search: from the fully
    partitioned scheme, merge the most similar subset pairs while the
    information score improves. Also resolves the rcluster-max sentinel
    to a concrete cap before starting.
    """
    # initialisation steps
    model_selection = the_config.model_selection
    partnum = len(the_config.user_subsets)

    # -987654321 is the "unset" sentinel for rcluster-max; replace it
    # with a cap of at least 1000, or 10x the number of user subsets.
    if the_config.cluster_max == -987654321:
        the_config.cluster_max = max(1000, 10 * len(the_config.user_subsets))
        log.info("Set rcluster-max to %d" % the_config.cluster_max)

    scheme_count = submodels.count_relaxed_clustering_schemes(
        partnum, the_config.cluster_percent, the_config.cluster_max)
    subset_count = submodels.count_relaxed_clustering_subsets(
        partnum, the_config.cluster_percent, the_config.cluster_max)

    log.info("PartitionFinder will have to analyse %d subsets to"
             " complete this analyses" % subset_count)
    the_config.progress.begin(scheme_count, subset_count)

    # Start with the most partitioned scheme, and record it.
    with logtools.indented(log, "*** Analysing starting scheme ***"):
        the_config.progress.begin(scheme_count, partnum)
        start_scheme = scheme.create_scheme(
            the_config, "start_scheme", range(partnum))
        start_result = self.analyse_scheme(start_scheme)
        start_score = start_result.score
        if not the_config.quick:
            the_config.reporter.write_scheme_summary(
                self.results.best_scheme, self.results.best_result)

    subsets = [s for s in start_scheme.subsets]
    partnum = len(subsets)
    step = 1
    while True:
        with logtools.indented(
                log,
                "*** Relaxed clustering algorithm step %d of up to %d ***"
                % (step, partnum - 1)):

            # get distances between subsets
            max_schemes = comb(len(start_scheme.subsets), 2)
            log.info("Measuring the similarity of %d subset pairs" %
                     max_schemes)
            d_matrix = neighbour.get_distance_matrix(
                subsets, the_config.cluster_weights)

            if step == 1:
                # Now initialise a change in info score matrix to inf
                c_matrix = np.empty(d_matrix.shape)
                c_matrix[:] = np.inf
                c_matrix = spatial.distance.squareform(c_matrix)

            # 1. pick top N subset pairs from distance matrix
            cutoff = int(math.ceil(max_schemes *
                                   (the_config.cluster_percent * 0.01)))
            if cutoff <= 0:
                cutoff = 1
            # identity check ('is not None') rather than '!= None'
            if the_config.cluster_max is not None and \
                    cutoff > the_config.cluster_max:
                cutoff = the_config.cluster_max
            log.info("Choosing the %d most similar subset pairs" % cutoff)
            closest_pairs = neighbour.get_N_closest_subsets(
                subsets, the_config, cutoff, d_matrix)

            # 2. analyse K subsets in top N that have not yet been analysed
            pairs_todo = neighbour.get_pairs_todo(
                closest_pairs, c_matrix, subsets)
            if len(pairs_todo) > 0:
                log.info("Analysing %d new subset pairs" % len(pairs_todo))
                new_subs = []
                sub_tuples = []
                for pair in pairs_todo:
                    new_sub = subset_ops.merge_subsets(pair)
                    new_subs.append(new_sub)
                    sub_tuples.append((new_sub, pair))

                the_config.progress.begin(scheme_count, len(new_subs))
                self.analyse_list_of_subsets(new_subs)

                # 3. for all K new subsets, update improvement matrix
                #    and find best pair
                log.info("Finding the best partitioning scheme")
                diffs = []
                scheme_name = "step_%d" % (step)
                for t in sub_tuples:
                    pair_merged = t[0]
                    pair = t[1]
                    new_scheme = neighbour.make_clustered_scheme(
                        start_scheme, scheme_name, pair,
                        pair_merged, the_config)
                    r = self.analyse_scheme(new_scheme)
                    diff = r.score - start_score
                    diffs.append(diff)

                c_matrix = neighbour.update_c_matrix(
                    c_matrix, sub_tuples, subsets, diffs)

            # 4. Find the best pair of subsets, and build a scheme based
            # on that. Note that this matrix includes diagonals, which
            # will all be zero, since this is equivalent to comparing a
            # scheme to itself. So we need to be careful to only proceed
            # if we have a negative change, which indicates an improvement
            # in the score
            best_change = np.amin(c_matrix)
            best_scheme = start_scheme

            if best_change >= 0:
                log.info("Found no schemes that improve the score, stopping")
                break

            median_improvement = np.median(c_matrix[c_matrix < 0])

            while best_change <= median_improvement:
                best_pair = neighbour.get_best_pair(
                    c_matrix, best_change, subsets)
                best_merged = subset_ops.merge_subsets(best_pair)
                best_scheme = neighbour.make_clustered_scheme(
                    start_scheme, scheme_name, best_pair,
                    best_merged, the_config)
                start_scheme = best_scheme

                log.info("Combining subsets: '%s' and '%s'" %
                         (best_pair[0].name, best_pair[1].name))
                log.debug("This improves the %s score by: %s",
                          the_config.model_selection, str(abs(best_change)))

                # reset_c_matrix and the subset list
                c_matrix = neighbour.reset_c_matrix(
                    c_matrix, list(best_pair), [best_merged], subsets)

                # we update the subset list in a way that means its
                # structure tracks the c-matrix
                subsets = neighbour.reset_subsets(
                    subsets, list(best_pair), [best_merged])

                best_change = np.amin(c_matrix)

                if the_config.search == 'rcluster':
                    # with rcluster we just take the single best change;
                    # otherwise we are using rclusterf, which continues
                    # in this loop
                    break

            # the best change can get updated a fraction at this point
            # because calculating the info score on the whole alignment
            # is a little different from doing it on the one subset
            best_result = self.analyse_scheme(best_scheme)
            best_change = self.results.best_score - start_score

            log.info(
                "The best scheme has %d subsets and improves the %s"
                " score by %.2f to %.1f",
                len(best_scheme.subsets),
                the_config.model_selection,
                np.abs(best_change),
                self.results.best_score)
            start_scheme = best_scheme
            start_score = best_result.score

            if not the_config.quick:
                the_config.reporter.write_scheme_summary(
                    best_scheme, best_result)

            # all subsets merged into one: nothing left to cluster
            if len(set(start_scheme.subsets)) == 1:
                break

            step += 1

    log.info("Relaxed clustering algorithm finished after %d steps" % step)
    log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
             % (self.results.best_scheme.name, model_selection,
                self.results.best_score))

    if the_config.min_subset_size or the_config.all_states:
        best_scheme = self.clean_scheme(self.results.best_scheme)
        best_result = self.analyse_scheme(best_scheme)

        # scores after cleaning can be worse, so we reset these trackers...
        self.results.best_result = best_result
        self.results.best_score = best_result.score
        self.results.best_scheme = best_scheme

        log.info("Best scoring scheme after cleaning is scheme %s,"
                 " with %s score of %.3f"
                 % (self.results.best_scheme.name, model_selection,
                    self.results.best_score))

    the_config.reporter.write_best_scheme(self.results)
def do_analysis(self):
    """Perform a relaxed clustering analysis.

    At each step, ranks every possible lumping of the current scheme's
    subsets by the clustering weights, analyses only the top
    cluster_percent of them, and keeps the best. Stops when a step finds
    no scheme that improves the model-selection score, or when all
    subsets have been lumped into one.
    """
    log.info("Performing relaxed clustering analysis")

    # Fraction of ranked lumpings to analyse each step (percent -> ratio).
    stop_at = self.cfg.cluster_percent * 0.01

    model_selection = self.cfg.model_selection
    partnum = len(self.cfg.partitions)

    scheme_count = submodels.count_relaxed_clustering_schemes(
        partnum, self.cfg.cluster_percent)
    subset_count = submodels.count_relaxed_clustering_subsets(
        partnum, self.cfg.cluster_percent)
    self.cfg.progress.begin(scheme_count, subset_count)

    # Start with the most partitioned scheme, and record it.
    start_description = range(len(self.cfg.partitions))
    start_scheme = scheme.create_scheme(
        self.cfg, "start_scheme", start_description)

    log.info("Analysing starting scheme (scheme %s)" % start_scheme.name)
    self.analyse_scheme(start_scheme)
    self.cfg.reporter.write_scheme_summary(
        self.results.best_scheme, self.results.best_result)

    # (an unused 'subset_counter' bookkeeping variable was removed)
    step = 1
    while True:
        log.info("***Relaxed clustering algorithm step %d of %d***"
                 % (step, partnum - 1))
        name_prefix = "step_%d" % (step)

        # Get a list of all possible lumpings of the best_scheme, ordered
        # according to the clustering weights
        lumped_subsets = neighbour.get_ranked_clustered_subsets(
            start_scheme, self.cfg)

        # reduce the size of the lumped subsets to cluster_percent long
        # round up to stop zeros
        cutoff = int(math.ceil(len(lumped_subsets) * stop_at))
        lumped_subsets = lumped_subsets[:cutoff]

        # Now analyse the lumped schemes
        lumpings_done = 0
        old_best_score = self.results.best_score

        for subset_grouping in lumped_subsets:
            scheme_name = "%s_%d" % (name_prefix, lumpings_done + 1)
            lumped_scheme = neighbour.make_clustered_scheme(
                start_scheme, scheme_name, subset_grouping, self.cfg)

            new_result = self.analyse_scheme(lumped_scheme)
            log.debug("Difference in %s: %.1f",
                      self.cfg.model_selection,
                      (new_result.score - old_best_score))
            lumpings_done += 1

        if self.results.best_score != old_best_score:
            log.info(
                "Analysed %.1f percent of the schemes for this step. The best "
                "scheme changed the %s score by %.1f units.",
                self.cfg.cluster_percent, self.cfg.model_selection,
                (self.results.best_score - old_best_score))

            # write out the best scheme
            self.results.best_scheme.name = "step_%d" % step
            self.cfg.reporter.write_scheme_summary(
                self.results.best_scheme, self.results.best_result)

            # Now we find out which is the best lumping we know of for
            # this step
            start_scheme = self.results.best_scheme
        else:
            log.info(
                "Analysed %.1f percent of the schemes for this step and found no schemes "
                "that improve the score, stopping",
                self.cfg.cluster_percent)
            break

        # We're done if it's the scheme with everything together
        if len(set(lumped_scheme.subsets)) == 1:
            break

        step += 1

    log.info("Relaxed clustering algorithm finished after %d steps" % step)
    log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
             % (self.results.best_scheme.name, model_selection,
                self.results.best_score))

    self.cfg.reporter.write_best_scheme(self.results)