def clean_scheme(self, start_scheme):
    """Merge away subsets that violate --min-subset-size or --all-states.

    Repeatedly scans the subsets of *start_scheme* (smallest first, so
    results are stable across re-runs) and merges any subset that is
    under the minimum size or has state problems into its closest
    neighbour, re-analysing the scheme after each merge.

    Returns the cleaned scheme.

    Raises AnalysisError if cleaning collapses everything into a single
    subset, since no analysis is then possible.
    """
    merges = 0
    with logtools.indented(
            log,
            "*** Checking subsets from scheme '%s' meet "
            "--min-subset-size and --all_states settings ***"
            % start_scheme.name):
        keep_going = True
        while keep_going:
            subsets = [s for s in start_scheme.subsets]
            # sort the subsets, to keep results consistent over re-runs
            subsets.sort(key=lambda x: 1.0 / float(len(x.columns)))

            # run through all subsets; merge the first offender we find,
            # then restart the scan on the updated scheme
            found = False
            for sub in subsets:
                state_problems = self.alignment.check_state_probs(
                    sub, the_config)
                if (len(sub.columns) < the_config.min_subset_size
                        or state_problems):
                    # merge that subset with nearest neighbour
                    new_pair = neighbour.get_closest_subset(
                        sub, subsets, the_config)
                    log.info("Subset '%s' will be merged with subset '%s'"
                             % (new_pair[0].name, new_pair[1].name))
                    new_pair_merged = subset_ops.merge_subsets(new_pair)
                    start_scheme = neighbour.make_clustered_scheme(
                        start_scheme, "cleaned_scheme", new_pair,
                        new_pair_merged, the_config)
                    the_config.progress.begin(1, 1)
                    self.analyse_scheme(start_scheme)
                    subsets = [s for s in start_scheme.subsets]
                    merges += 1
                    found = True
                    break

            # if we got to here, there were no subsets to merge
            if not found:
                keep_going = False

        if len(subsets) == 1:
            log.error(
                "The settings you have used for --all-states and/or --min-subset-size mean that all of your subsets have been merged into one prior to any analysis. Thus, no analysis is necessary. Please check and try again"
            )
            raise AnalysisError

        log.info(
            "%d subsets merged because of --min-subset-size and/or --all-states settings"
            % merges)

    return start_scheme
def clean_scheme(self, start_scheme):
    """Enforce --min-subset-size and --all-states on a finished scheme.

    Walks the scheme's subsets from smallest to largest and, whenever
    one fails the size or state checks, merges it with its closest
    neighbour and re-analyses the scheme.  Repeats until a full pass
    finds nothing to merge, then returns the cleaned scheme.

    Raises AnalysisError if every subset ends up merged into one.
    """
    merge_count = 0
    banner = ("*** Checking subsets from scheme '%s' meet "
              "--min-subset-size and --all_states settings ***"
              % start_scheme.name)
    with logtools.indented(log, banner):
        while True:
            current = [s for s in start_scheme.subsets]
            # smallest subsets first, so results are stable across re-runs
            current.sort(key=lambda s: 1.0 / float(len(s.columns)))

            merged_this_pass = False
            for candidate in current:
                has_state_problem = self.alignment.check_state_probs(
                    candidate, the_config)
                too_small = (len(candidate.columns) <
                             the_config.min_subset_size)
                if too_small or has_state_problem == True:
                    # fold the offending subset into its nearest neighbour
                    partner_pair = neighbour.get_closest_subset(
                        candidate, current, the_config)
                    log.info("Subset '%s' will be merged with subset '%s'"
                             % (partner_pair[0].name, partner_pair[1].name))
                    combined = subset_ops.merge_subsets(partner_pair)
                    start_scheme = neighbour.make_clustered_scheme(
                        start_scheme, "cleaned_scheme", partner_pair,
                        combined, the_config)
                    the_config.progress.begin(1, 1)
                    self.analyse_scheme(start_scheme)
                    current = [s for s in start_scheme.subsets]
                    merge_count += 1
                    merged_this_pass = True
                    break

            # a clean pass means every remaining subset is acceptable
            if not merged_this_pass:
                break

        if len(current) == 1:
            log.error(
                "The settings you have used for --all-states and/or --min-subset-size mean that all of your subsets have been merged into one prior to any analysis. Thus, no analysis is necessary. Please check and try again"
            )
            raise AnalysisError

        log.info(
            "%d subsets merged because of --min-subset-size and/or --all-states settings"
            % merge_count)

    return start_scheme
def do_analysis(self):
    """Run the 'greediest' heuristic partitioning search.

    At each step, rank all possible pairwise lumpings of the current
    scheme by clustering similarity, analyse only the top
    greediest_percent of them, and keep the best scheme found.  Stop
    when a step fails to improve the score, or when everything has
    been lumped into a single subset.
    """
    log.info("Performing greediest analysis")

    stop_at = self.cfg.greediest_percent * 0.01
    model_selection = self.cfg.model_selection
    partnum = len(self.cfg.partitions)

    scheme_count = submodels.count_greediest_schemes(
        partnum, self.cfg.greediest_percent)
    subset_count = submodels.count_greediest_subsets(
        partnum, self.cfg.greediest_percent)
    self.cfg.progress.begin(scheme_count, subset_count)

    # Start with the most partitioned scheme, and record it.
    start_description = range(len(self.cfg.partitions))
    start_scheme = scheme.create_scheme(
        self.cfg, "start_scheme", start_description)

    log.info("Analysing starting scheme (scheme %s)" % start_scheme.name)
    self.analyse_scheme(start_scheme)
    self.cfg.reporter.write_scheme_summary(
        self.results.best_scheme, self.results.best_result)

    step = 1
    while True:
        log.info("***Greediest algorithm step %d of %d***"
                 % (step, partnum - 1))

        name_prefix = "step_%d" % (step)

        # Get a list of all possible lumpings of the best_scheme, ordered
        # according to the clustering weights
        lumped_subsets = neighbour.get_ranked_clustered_subsets(
            start_scheme, self.cfg)

        # reduce the size of the lumped subsets to greediest_percent long
        cutoff = int(math.ceil(len(lumped_subsets) * stop_at))  # round up to stop zeros
        lumped_subsets = lumped_subsets[:cutoff]

        # Now analyse the lumped schemes
        lumpings_done = 0
        old_best_score = self.results.best_score

        for subset_grouping in lumped_subsets:
            scheme_name = "%s_%d" % (name_prefix, lumpings_done + 1)
            lumped_scheme = neighbour.make_clustered_scheme(
                start_scheme, scheme_name, subset_grouping, self.cfg)

            new_result = self.analyse_scheme(lumped_scheme)

            # debug, not info: this fires once per candidate scheme and
            # would flood the log (matches the relaxed-clustering search)
            log.debug("Difference in %s: %.1f",
                      self.cfg.model_selection,
                      (new_result.score - old_best_score))

            lumpings_done += 1

        if self.results.best_score != old_best_score:
            # at least one lumping improved the score this step
            log.info("Analysed %.1f percent of the schemes for this step. The best "
                     "scheme changed the %s score by %.1f units.",
                     self.cfg.greediest_percent, self.cfg.model_selection,
                     (self.results.best_score - old_best_score))

            # write out the best scheme
            self.results.best_scheme.name = "step_%d" % step
            self.cfg.reporter.write_scheme_summary(
                self.results.best_scheme, self.results.best_result)

            # Now we find out which is the best lumping we know of for this step
            start_scheme = self.results.best_scheme
        else:
            log.info("Analysed %.1f percent of the schemes for this step and found no schemes "
                     "that improve the score, stopping",
                     self.cfg.greediest_percent)
            break

        # We're done if it's the scheme with everything together
        if len(set(lumped_scheme.subsets)) == 1:
            break

        step += 1

    log.info("Greediest algorithm finished after %d steps" % step)
    log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
             % (self.results.best_scheme.name, model_selection,
                self.results.best_score))

    txt = "Best scheme according to Greediest algorithm, analysed with %s" % self.cfg.model_selection
    self.cfg.reporter.write_best_scheme(txt, self.results)
def do_analysis(self):
    """Run the relaxed clustering (rcluster / rclusterf) search.

    Starting from the fully partitioned scheme, each step:
      1. ranks subset pairs by similarity (distance matrix),
      2. analyses the top N candidate merges not yet seen,
      3. records each merge's change to the information score in
         c_matrix,
      4. applies the best merge (rcluster) or all merges down to the
         median improvement (rclusterf).
    Stops when no merge improves the score or one subset remains, then
    optionally cleans the scheme for --min-subset-size / --all-states.
    """
    # initialisation steps
    model_selection = the_config.model_selection
    partnum = len(the_config.user_subsets)

    scheme_count = submodels.count_relaxed_clustering_schemes(
        partnum, the_config.cluster_percent, the_config.cluster_max)
    subset_count = submodels.count_relaxed_clustering_subsets(
        partnum, the_config.cluster_percent, the_config.cluster_max)

    log.info("PartitionFinder will have to analyse %d subsets to"
             " complete this analyses" % subset_count)
    the_config.progress.begin(scheme_count, subset_count)

    # Start with the most partitioned scheme, and record it.
    with logtools.indented(log, "*** Analysing starting scheme ***"):
        the_config.progress.begin(scheme_count, partnum)
        start_scheme = scheme.create_scheme(
            the_config, "start_scheme", range(partnum))
        start_result = self.analyse_scheme(start_scheme)
        start_score = start_result.score
        if not the_config.quick:
            the_config.reporter.write_scheme_summary(
                self.results.best_scheme, self.results.best_result)

    subsets = [s for s in start_scheme.subsets]
    partnum = len(subsets)
    step = 1
    while True:
        with logtools.indented(
                log,
                "*** Relaxed clustering algorithm step %d of up to %d ***"
                % (step, partnum - 1)):

            # get distances between subsets
            max_schemes = comb(len(start_scheme.subsets), 2)
            log.info("Measuring the similarity of %d subset pairs"
                     % max_schemes)
            d_matrix = neighbour.get_distance_matrix(
                subsets, the_config.cluster_weights)

            if step == 1:
                # Now initialise a change in info score matrix to inf
                c_matrix = np.empty(d_matrix.shape)
                c_matrix[:] = np.inf
                c_matrix = spatial.distance.squareform(c_matrix)

            # 1. pick top N subset pairs from distance matrix
            cutoff = int(math.ceil(
                max_schemes * (the_config.cluster_percent * 0.01)))
            if cutoff <= 0:
                cutoff = 1
            # PEP 8: compare to None with 'is not', not '!='
            if (the_config.cluster_max is not None
                    and cutoff > the_config.cluster_max):
                cutoff = the_config.cluster_max
            log.info("Choosing the %d most similar subset pairs" % cutoff)
            closest_pairs = neighbour.get_N_closest_subsets(
                subsets, the_config, cutoff, d_matrix)

            # 2. analyse K subsets in top N that have not yet been analysed
            pairs_todo = neighbour.get_pairs_todo(
                closest_pairs, c_matrix, subsets)
            if pairs_todo:
                log.info("Analysing %d new subset pairs" % len(pairs_todo))
                new_subs = []
                sub_tuples = []
                for pair in pairs_todo:
                    new_sub = subset_ops.merge_subsets(pair)
                    new_subs.append(new_sub)
                    sub_tuples.append((new_sub, pair))

                the_config.progress.begin(scheme_count, len(new_subs))
                self.analyse_list_of_subsets(new_subs)

                # 3. for all K new subsets, update improvement matrix
                # and find best pair
                log.info("Finding the best partitioning scheme")
                diffs = []
                scheme_name = "step_%d" % (step)
                for pair_merged, pair in sub_tuples:
                    new_scheme = neighbour.make_clustered_scheme(
                        start_scheme, scheme_name, pair, pair_merged,
                        the_config)
                    r = self.analyse_scheme(new_scheme)
                    diffs.append(r.score - start_score)

                c_matrix = neighbour.update_c_matrix(
                    c_matrix, sub_tuples, subsets, diffs)

            # 4. Find the best pair of subsets, and build a scheme based
            # on that.  Note that this matrix includes diagonals, which
            # will all be zero, since this is equivalent to comparing a
            # scheme to itself.  So we need to be careful to only proceed
            # if we have a negative change, which indicates an
            # improvement in the score.
            best_change = np.amin(c_matrix)
            best_scheme = start_scheme

            if best_change >= 0:
                log.info(
                    "Found no schemes that improve the score, stopping")
                break

            median_improvement = np.median(c_matrix[c_matrix < 0])

            while best_change <= median_improvement:
                best_pair = neighbour.get_best_pair(
                    c_matrix, best_change, subsets)
                best_merged = subset_ops.merge_subsets(best_pair)
                best_scheme = neighbour.make_clustered_scheme(
                    start_scheme, scheme_name, best_pair, best_merged,
                    the_config)
                start_scheme = best_scheme

                log.info("Combining subsets: '%s' and '%s'"
                         % (best_pair[0].name, best_pair[1].name))
                log.info("This improves the %s score by: %s",
                         the_config.model_selection,
                         str(abs(best_change)))

                # reset_c_matrix and the subset list
                c_matrix = neighbour.reset_c_matrix(
                    c_matrix, list(best_pair), [best_merged], subsets)
                # we update the subset list in a way that means its
                # structure tracks the c-matrix
                subsets = neighbour.reset_subsets(
                    subsets, list(best_pair), [best_merged])
                best_change = np.amin(c_matrix)

                if the_config.search == 'rcluster':
                    # with rcluster we just take the single best change;
                    # rclusterf keeps merging to the median improvement
                    break

            # the best change can get updated a fraction at this point
            # because calaculting the info score on the whole alignment
            # is a little different from doing it on the one subset
            best_result = self.analyse_scheme(best_scheme)
            best_change = self.results.best_score - start_score

            log.info(
                "The best scheme has %d subsets and improves the %s score by %.2f to %.1f",
                len(best_scheme.subsets),
                the_config.model_selection,
                np.abs(best_change),
                self.results.best_score)
            start_scheme = best_scheme
            start_score = best_result.score

            if not the_config.quick:
                the_config.reporter.write_scheme_summary(
                    best_scheme, best_result)

            if len(set(start_scheme.subsets)) == 1:
                break

            step += 1

    log.info("Relaxed clustering algorithm finished after %d steps" % step)
    log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
             % (self.results.best_scheme.name, model_selection,
                self.results.best_score))

    if the_config.min_subset_size or the_config.all_states:
        best_scheme = self.clean_scheme(self.results.best_scheme)
        best_result = self.analyse_scheme(best_scheme)

        # scores after cleaning can be worse, so we reset these trackers...
        self.results.best_result = best_result
        self.results.best_score = best_result.score
        self.results.best_scheme = best_scheme

        log.info(
            "Best scoring scheme after cleaning is scheme %s, with %s score of %.3f"
            % (self.results.best_scheme.name, model_selection,
               self.results.best_score))

    the_config.reporter.write_best_scheme(self.results)
def do_analysis(self):
    """A greedy algorithm for heuristic partitioning searches.

    Considers every possible pairwise merge at each step (implemented
    by feeding an all-inf 'fake' distance matrix into the relaxed
    clustering machinery, so all the c_matrix bookkeeping is reused),
    applies the best one, and stops when no merge improves the score
    or only one subset remains.
    """
    partnum = len(the_config.user_subsets)

    scheme_count = submodels.count_greedy_schemes(partnum)
    subset_count = submodels.count_greedy_subsets(partnum)
    the_config.progress.begin(scheme_count, subset_count)

    # Start with the most partitioned scheme, and record it.
    with logtools.indented(log, "*** Analysing starting scheme ***"):
        the_config.progress.begin(scheme_count, partnum)
        start_scheme = scheme.create_scheme(
            the_config, "start_scheme", range(partnum))
        start_result = self.analyse_scheme(start_scheme)
        start_score = start_result.score
        if not the_config.quick:
            the_config.reporter.write_scheme_summary(
                self.results.best_scheme, self.results.best_result)

    subsets = [s for s in start_scheme.subsets]

    step = 1
    while len(set(start_scheme.subsets)) > 1:
        with logtools.indented(log,
                               "***Greedy algorithm step %d***" % step):
            name_prefix = "step_%d" % (step)

            # get distances between subsets
            max_schemes = comb(len(start_scheme.subsets), 2)

            # this is a fake distance matrix, so that the greedy
            # algorithm can use all the tricks of the relaxed
            # clustering algorithm
            dim = len(subsets)
            # NB: integer division — a condensed distance matrix has
            # (dim*dim - dim)/2 entries, and np.zeros needs an int
            # (true division here raises TypeError on Python 3)
            d_matrix = np.zeros((dim * dim - dim) // 2)
            d_matrix[:] = np.inf

            if step == 1:
                # Now initialise a change in info score matrix to inf
                c_matrix = np.empty(d_matrix.shape)
                c_matrix[:] = np.inf
                c_matrix = spatial.distance.squareform(c_matrix)

            # 1. pick top N subset pairs from distance matrix
            cutoff = max_schemes  # this defines the greedy algorithm: we look at all schemes
            closest_pairs = neighbour.get_N_closest_subsets(
                subsets, the_config, cutoff, d_matrix)

            # 2. analyse subsets in top N that have not yet been analysed
            pairs_todo = neighbour.get_pairs_todo(
                closest_pairs, c_matrix, subsets)
            if len(pairs_todo) > 0:
                log.info("Analysing %d new subset pairs" % len(pairs_todo))
                new_subs = []
                sub_tuples = []
                for pair in pairs_todo:
                    new_sub = subset_ops.merge_subsets(pair)
                    new_subs.append(new_sub)
                    sub_tuples.append((new_sub, pair))

                the_config.progress.begin(scheme_count, len(new_subs))
                self.analyse_list_of_subsets(new_subs)

                # 3. for all K new subsets, update improvement matrix
                # and find best pair
                log.info("Finding the best partitioning scheme")
                diffs = []
                scheme_name = "step_%d" % (step)
                for t in sub_tuples:
                    pair_merged = t[0]
                    pair = t[1]
                    new_scheme = neighbour.make_clustered_scheme(
                        start_scheme, scheme_name, pair, pair_merged,
                        the_config)
                    r = self.analyse_scheme(new_scheme)
                    diff = r.score - start_score
                    diffs.append(diff)

                c_matrix = neighbour.update_c_matrix(
                    c_matrix, sub_tuples, subsets, diffs)

            # 4. Find the best pair of subsets, and build a scheme based
            # on that.  Note that this matrix includes diagonals, which
            # will all be zero, since this is equivalent to comparing a
            # scheme to itself.  So we need to be careful to only proceed
            # if we have a negative change, which indicates an
            # improvement in the score.
            best_change = np.amin(c_matrix)
            log.debug("Biggest improvement in info score: %s",
                      str(best_change))

            if best_change >= 0:
                log.info(
                    "Found no schemes that improve the score, stopping")
                break

            best_pair = neighbour.get_best_pair(
                c_matrix, best_change, subsets)
            best_merged = subset_ops.merge_subsets(best_pair)
            best_scheme = neighbour.make_clustered_scheme(
                start_scheme, scheme_name, best_pair, best_merged,
                the_config)
            best_result = self.analyse_scheme(best_scheme)

            # the best change can get updated a fraction at this point
            # because calaculting the info score on the whole alignment
            # is a little different from doing it on the one subset
            best_change = self.results.best_score - start_score

            log.info("Best scheme combines subsets: '%s' and '%s'"
                     % (best_pair[0].name, best_pair[1].name))
            log.info(
                "The best scheme improves the %s score by %.2f to %.1f",
                the_config.model_selection,
                np.abs(best_change),
                self.results.best_score)

            start_scheme = best_scheme
            start_score = best_result.score

            log.debug("Best pair: %s", str([s.name for s in best_pair]))
            log.debug("Merged into: %s", str([best_merged.name]))

            # 5. reset_c_matrix and the subset list
            c_matrix = neighbour.reset_c_matrix(
                c_matrix, list(best_pair), [best_merged], subsets)

            # we updated the subset list in a special way, which matches
            # how we update the c matrix:
            subsets = neighbour.reset_subsets(
                subsets, list(best_pair), [best_merged])

            if not the_config.quick:
                the_config.reporter.write_scheme_summary(
                    best_scheme, best_result)

            step += 1

    log.info("Greedy algorithm finished after %d steps" % step)
    log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
             % (self.results.best_scheme.name,
                the_config.model_selection,
                self.results.best_score))

    the_config.reporter.write_best_scheme(self.results)
def do_analysis(self):
    """Run the relaxed clustering search (variant with rcluster-max
    auto-default).

    Identical in structure to the rcluster/rclusterf search: rank
    subset pairs by similarity, analyse the most similar candidates,
    track score changes in c_matrix, and merge until no improvement
    remains.  The sentinel -987654321 on cluster_max means the user
    did not set --rcluster-max, so a default is chosen here.
    """
    # initialisation steps
    model_selection = the_config.model_selection
    partnum = len(the_config.user_subsets)

    if the_config.cluster_max == -987654321:
        # sentinel: --rcluster-max not set; default to 1000 or 10x the
        # number of user subsets, whichever is larger
        the_config.cluster_max = max(
            [1000, (10 * len(the_config.user_subsets))])
        log.info("Set rcluster-max to %d" % the_config.cluster_max)

    scheme_count = submodels.count_relaxed_clustering_schemes(
        partnum, the_config.cluster_percent, the_config.cluster_max)
    subset_count = submodels.count_relaxed_clustering_subsets(
        partnum, the_config.cluster_percent, the_config.cluster_max)

    log.info("PartitionFinder will have to analyse %d subsets to"
             " complete this analyses" % subset_count)
    the_config.progress.begin(scheme_count, subset_count)

    # Start with the most partitioned scheme, and record it.
    with logtools.indented(log, "*** Analysing starting scheme ***"):
        the_config.progress.begin(scheme_count, partnum)
        start_scheme = scheme.create_scheme(
            the_config, "start_scheme", range(partnum))
        start_result = self.analyse_scheme(start_scheme)
        start_score = start_result.score
        if not the_config.quick:
            the_config.reporter.write_scheme_summary(
                self.results.best_scheme, self.results.best_result)

    subsets = [s for s in start_scheme.subsets]
    partnum = len(subsets)
    step = 1
    while True:
        with logtools.indented(
                log,
                "*** Relaxed clustering algorithm step %d of up to %d ***"
                % (step, partnum - 1)):

            # get distances between subsets
            max_schemes = comb(len(start_scheme.subsets), 2)
            log.info("Measuring the similarity of %d subset pairs"
                     % max_schemes)
            d_matrix = neighbour.get_distance_matrix(
                subsets, the_config.cluster_weights)

            if step == 1:
                # Now initialise a change in info score matrix to inf
                c_matrix = np.empty(d_matrix.shape)
                c_matrix[:] = np.inf
                c_matrix = spatial.distance.squareform(c_matrix)

            # 1. pick top N subset pairs from distance matrix
            cutoff = int(math.ceil(
                max_schemes * (the_config.cluster_percent * 0.01)))
            if cutoff <= 0:
                cutoff = 1
            # PEP 8: compare to None with 'is not', not '!='
            if (the_config.cluster_max is not None
                    and cutoff > the_config.cluster_max):
                cutoff = the_config.cluster_max
            log.info("Choosing the %d most similar subset pairs" % cutoff)
            closest_pairs = neighbour.get_N_closest_subsets(
                subsets, the_config, cutoff, d_matrix)

            # 2. analyse K subsets in top N that have not yet been analysed
            pairs_todo = neighbour.get_pairs_todo(
                closest_pairs, c_matrix, subsets)
            if pairs_todo:
                log.info("Analysing %d new subset pairs" % len(pairs_todo))
                new_subs = []
                sub_tuples = []
                for pair in pairs_todo:
                    new_sub = subset_ops.merge_subsets(pair)
                    new_subs.append(new_sub)
                    sub_tuples.append((new_sub, pair))

                the_config.progress.begin(scheme_count, len(new_subs))
                self.analyse_list_of_subsets(new_subs)

                # 3. for all K new subsets, update improvement matrix
                # and find best pair
                log.info("Finding the best partitioning scheme")
                diffs = []
                scheme_name = "step_%d" % (step)
                for pair_merged, pair in sub_tuples:
                    new_scheme = neighbour.make_clustered_scheme(
                        start_scheme, scheme_name, pair, pair_merged,
                        the_config)
                    r = self.analyse_scheme(new_scheme)
                    diffs.append(r.score - start_score)

                c_matrix = neighbour.update_c_matrix(
                    c_matrix, sub_tuples, subsets, diffs)

            # 4. Find the best pair of subsets, and build a scheme based
            # on that.  Note that this matrix includes diagonals, which
            # will all be zero, since this is equivalent to comparing a
            # scheme to itself.  So we need to be careful to only proceed
            # if we have a negative change, which indicates an
            # improvement in the score.
            best_change = np.amin(c_matrix)
            best_scheme = start_scheme

            if best_change >= 0:
                log.info(
                    "Found no schemes that improve the score, stopping")
                break

            median_improvement = np.median(c_matrix[c_matrix < 0])

            while best_change <= median_improvement:
                best_pair = neighbour.get_best_pair(
                    c_matrix, best_change, subsets)
                best_merged = subset_ops.merge_subsets(best_pair)
                best_scheme = neighbour.make_clustered_scheme(
                    start_scheme, scheme_name, best_pair, best_merged,
                    the_config)
                start_scheme = best_scheme

                log.info("Combining subsets: '%s' and '%s'"
                         % (best_pair[0].name, best_pair[1].name))
                log.debug("This improves the %s score by: %s",
                          the_config.model_selection,
                          str(abs(best_change)))

                # reset_c_matrix and the subset list
                c_matrix = neighbour.reset_c_matrix(
                    c_matrix, list(best_pair), [best_merged], subsets)
                # we update the subset list in a way that means its
                # structure tracks the c-matrix
                subsets = neighbour.reset_subsets(
                    subsets, list(best_pair), [best_merged])
                best_change = np.amin(c_matrix)

                if the_config.search == 'rcluster':
                    # with rcluster we just take the single best change;
                    # rclusterf keeps merging to the median improvement
                    break

            # the best change can get updated a fraction at this point
            # because calaculting the info score on the whole alignment
            # is a little different from doing it on the one subset
            best_result = self.analyse_scheme(best_scheme)
            best_change = self.results.best_score - start_score

            log.info(
                "The best scheme has %d subsets and improves the %s score by %.2f to %.1f",
                len(best_scheme.subsets),
                the_config.model_selection,
                np.abs(best_change),
                self.results.best_score)
            start_scheme = best_scheme
            start_score = best_result.score

            if not the_config.quick:
                the_config.reporter.write_scheme_summary(
                    best_scheme, best_result)

            if len(set(start_scheme.subsets)) == 1:
                break

            step += 1

    log.info("Relaxed clustering algorithm finished after %d steps" % step)
    log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
             % (self.results.best_scheme.name, model_selection,
                self.results.best_score))

    if the_config.min_subset_size or the_config.all_states:
        best_scheme = self.clean_scheme(self.results.best_scheme)
        best_result = self.analyse_scheme(best_scheme)

        # scores after cleaning can be worse, so we reset these trackers...
        self.results.best_result = best_result
        self.results.best_score = best_result.score
        self.results.best_scheme = best_scheme

        log.info(
            "Best scoring scheme after cleaning is scheme %s, with %s score of %.3f"
            % (self.results.best_scheme.name, model_selection,
               self.results.best_score))

    the_config.reporter.write_best_scheme(self.results)
def do_analysis(self):
    """A greedy algorithm for heuristic partitioning searches.

    Evaluates every possible pairwise merge at each step by driving
    the relaxed-clustering machinery with an all-inf fake distance
    matrix, applies the single best merge, and repeats until no merge
    improves the score or one subset remains.
    """
    partnum = len(the_config.user_subsets)

    scheme_count = submodels.count_greedy_schemes(partnum)
    subset_count = submodels.count_greedy_subsets(partnum)
    the_config.progress.begin(scheme_count, subset_count)

    # Start with the most partitioned scheme, and record it.
    with logtools.indented(log, "*** Analysing starting scheme ***"):
        the_config.progress.begin(scheme_count, partnum)
        start_scheme = scheme.create_scheme(
            the_config, "start_scheme", range(partnum))
        start_result = self.analyse_scheme(start_scheme)
        start_score = start_result.score
        if not the_config.quick:
            the_config.reporter.write_scheme_summary(
                self.results.best_scheme, self.results.best_result)

    subsets = [s for s in start_scheme.subsets]

    step = 1
    while len(set(start_scheme.subsets)) > 1:
        with logtools.indented(log,
                               "***Greedy algorithm step %d***" % step):
            name_prefix = "step_%d" % (step)

            # get distances between subsets
            max_schemes = comb(len(start_scheme.subsets), 2)

            # this is a fake distance matrix, so that the greedy
            # algorithm can use all the tricks of the relaxed
            # clustering algorithm
            dim = len(subsets)
            # NB: integer division — a condensed distance matrix has
            # (dim*dim - dim)/2 entries, and np.zeros needs an int
            # (true division here raises TypeError on Python 3)
            d_matrix = np.zeros((dim * dim - dim) // 2)
            d_matrix[:] = np.inf

            if step == 1:
                # Now initialise a change in info score matrix to inf
                c_matrix = np.empty(d_matrix.shape)
                c_matrix[:] = np.inf
                c_matrix = spatial.distance.squareform(c_matrix)

            # 1. pick top N subset pairs from distance matrix
            cutoff = max_schemes  # this defines the greedy algorithm: we look at all schemes
            closest_pairs = neighbour.get_N_closest_subsets(
                subsets, the_config, cutoff, d_matrix)

            # 2. analyse subsets in top N that have not yet been analysed
            pairs_todo = neighbour.get_pairs_todo(
                closest_pairs, c_matrix, subsets)
            if len(pairs_todo) > 0:
                log.info("Analysing %d new subset pairs" % len(pairs_todo))
                new_subs = []
                sub_tuples = []
                for pair in pairs_todo:
                    new_sub = subset_ops.merge_subsets(pair)
                    new_subs.append(new_sub)
                    sub_tuples.append((new_sub, pair))

                the_config.progress.begin(scheme_count, len(new_subs))
                self.analyse_list_of_subsets(new_subs)

                # 3. for all K new subsets, update improvement matrix
                # and find best pair
                log.info("Finding the best partitioning scheme")
                diffs = []
                scheme_name = "step_%d" % (step)
                for t in sub_tuples:
                    pair_merged = t[0]
                    pair = t[1]
                    new_scheme = neighbour.make_clustered_scheme(
                        start_scheme, scheme_name, pair, pair_merged,
                        the_config)
                    r = self.analyse_scheme(new_scheme)
                    diff = r.score - start_score
                    diffs.append(diff)

                c_matrix = neighbour.update_c_matrix(
                    c_matrix, sub_tuples, subsets, diffs)

            # 4. Find the best pair of subsets, and build a scheme based
            # on that.  Note that this matrix includes diagonals, which
            # will all be zero, since this is equivalent to comparing a
            # scheme to itself.  So we need to be careful to only proceed
            # if we have a negative change, which indicates an
            # improvement in the score.
            best_change = np.amin(c_matrix)
            log.debug("Biggest improvement in info score: %s",
                      str(best_change))

            if best_change >= 0:
                log.info(
                    "Found no schemes that improve the score, stopping")
                break

            best_pair = neighbour.get_best_pair(
                c_matrix, best_change, subsets)
            best_merged = subset_ops.merge_subsets(best_pair)
            best_scheme = neighbour.make_clustered_scheme(
                start_scheme, scheme_name, best_pair, best_merged,
                the_config)
            best_result = self.analyse_scheme(best_scheme)

            # the best change can get updated a fraction at this point
            # because calaculting the info score on the whole alignment
            # is a little different from doing it on the one subset
            best_change = self.results.best_score - start_score

            log.info("Best scheme combines subsets: '%s' and '%s'"
                     % (best_pair[0].name, best_pair[1].name))
            log.info(
                "The best scheme improves the %s score by %.2f to %.1f",
                the_config.model_selection,
                np.abs(best_change),
                self.results.best_score)

            start_scheme = best_scheme
            start_score = best_result.score

            log.debug("Best pair: %s", str([s.name for s in best_pair]))
            log.debug("Merged into: %s", str([best_merged.name]))

            # 5. reset_c_matrix and the subset list
            c_matrix = neighbour.reset_c_matrix(
                c_matrix, list(best_pair), [best_merged], subsets)

            # we updated the subset list in a special way, which matches
            # how we update the c matrix:
            subsets = neighbour.reset_subsets(
                subsets, list(best_pair), [best_merged])

            if not the_config.quick:
                the_config.reporter.write_scheme_summary(
                    best_scheme, best_result)

            step += 1

    log.info("Greedy algorithm finished after %d steps" % step)
    log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
             % (self.results.best_scheme.name,
                the_config.model_selection,
                self.results.best_score))

    the_config.reporter.write_best_scheme(self.results)
def do_analysis(self):
    """Relaxed clustering search (ranked-lumpings variant).

    At each step, ranks all possible pairwise lumpings of the current
    best scheme by the clustering weights, analyses only the top
    cluster_percent of them, and keeps the best scheme found.  Stops
    when a step fails to improve the score, or when everything has
    been lumped into a single subset.
    """
    log.info("Performing relaxed clustering analysis")

    stop_at = self.cfg.cluster_percent * 0.01
    model_selection = self.cfg.model_selection
    partnum = len(self.cfg.partitions)

    scheme_count = submodels.count_relaxed_clustering_schemes(
        partnum, self.cfg.cluster_percent)
    subset_count = submodels.count_relaxed_clustering_subsets(
        partnum, self.cfg.cluster_percent)
    self.cfg.progress.begin(scheme_count, subset_count)

    # Start with the most partitioned scheme, and record it.
    start_description = range(len(self.cfg.partitions))
    start_scheme = scheme.create_scheme(self.cfg, "start_scheme",
                                        start_description)

    log.info("Analysing starting scheme (scheme %s)" % start_scheme.name)
    self.analyse_scheme(start_scheme)
    self.cfg.reporter.write_scheme_summary(self.results.best_scheme,
                                           self.results.best_result)

    # Start by remembering that we analysed the starting scheme
    # NOTE(review): subset_counter is never read below — looks vestigial
    subset_counter = 1

    step = 1
    while True:
        log.info("***Relaxed clustering algorithm step %d of %d***"
                 % (step, partnum - 1))

        name_prefix = "step_%d" % (step)

        # Get a list of all possible lumpings of the best_scheme, ordered
        # according to the clustering weights
        lumped_subsets = neighbour.get_ranked_clustered_subsets(
            start_scheme, self.cfg)

        # reduce the size of the lumped subsets to cluster_percent long
        cutoff = int(math.ceil(len(lumped_subsets) * stop_at))  # round up to stop zeros
        lumped_subsets = lumped_subsets[:cutoff]

        # Now analyse the lumped schemes
        lumpings_done = 0
        old_best_score = self.results.best_score

        for subset_grouping in lumped_subsets:
            scheme_name = "%s_%d" % (name_prefix, lumpings_done + 1)
            lumped_scheme = neighbour.make_clustered_scheme(
                start_scheme, scheme_name, subset_grouping, self.cfg)

            # analyse_scheme updates self.results.best_* as a side effect
            new_result = self.analyse_scheme(lumped_scheme)

            log.debug("Difference in %s: %.1f",
                      self.cfg.model_selection,
                      (new_result.score - old_best_score))

            lumpings_done += 1

        if self.results.best_score != old_best_score:
            # at least one lumping improved the score this step
            log.info(
                "Analysed %.1f percent of the schemes for this step. The best "
                "scheme changed the %s score by %.1f units.",
                self.cfg.cluster_percent, self.cfg.model_selection,
                (self.results.best_score - old_best_score))

            # write out the best scheme
            self.results.best_scheme.name = "step_%d" % step
            self.cfg.reporter.write_scheme_summary(
                self.results.best_scheme, self.results.best_result)

            # Now we find out which is the best lumping we know of for this step
            start_scheme = self.results.best_scheme
        else:
            log.info(
                "Analysed %.1f percent of the schemes for this step and found no schemes "
                "that improve the score, stopping",
                self.cfg.cluster_percent)
            break

        # We're done if it's the scheme with everything together
        # (lumped_scheme is the last candidate analysed in the loop above)
        if len(set(lumped_scheme.subsets)) == 1:
            break

        step += 1

    log.info("Relaxed clustering algorithm finished after %d steps" % step)
    log.info("Best scoring scheme is scheme %s, with %s score of %.3f" %
             (self.results.best_scheme.name, model_selection,
              self.results.best_score))

    self.cfg.reporter.write_best_scheme(self.results)
def do_analysis(self):
    """Kmeans splitting pass followed by a greedy merging pass.

    Phase 1: repeatedly try to kmeans-split each subset; keep a split
    whenever the resulting scheme scores better than the current best.
    Phase 2: run the standard greedy lumping algorithm on the scheme
    that phase 1 produced.

    Side effects: updates self.results, writes per-step scheme summaries
    and the final best scheme via self.cfg.reporter.
    """
    # Copied and pasted from greedy analysis
    partnum = len(self.cfg.user_subsets)
    scheme_count = submodels.count_greedy_schemes(partnum)
    subset_count = submodels.count_greedy_subsets(partnum)
    self.cfg.progress.begin(scheme_count, subset_count)

    # Start with the most partitioned scheme.
    start_description = range(partnum)
    start_scheme = scheme.create_scheme(
        self.cfg, "start_scheme", start_description)

    log.info("Analysing starting scheme (scheme %s)" % start_scheme.name)
    # Analyse for its side effects on self.results; the return value of
    # this first call was never used.
    self.analyse_scheme(start_scheme)

    # Get first scheme
    best_scheme = start_scheme
    subset_index = 0

    all_subsets = list(best_scheme.subsets)
    processor = self.cfg.processor
    alignment_path = self.filtered_alignment_path
    tree_path = processor.make_tree_path(alignment_path)

    while subset_index < len(all_subsets):
        current_subset = all_subsets[subset_index]
        # NOTE(review): kmeans_split_subset appears to return the sentinel
        # value 1 when the subset cannot be split, and a list of new
        # subsets otherwise -- confirm against the kmeans module.
        split_subsets = kmeans.kmeans_split_subset(
            self.cfg, self.alignment, current_subset, tree_path)

        if split_subsets == 1:
            subset_index += 1
        else:
            # Take a copy
            updated_subsets = all_subsets[:]

            # Replace the current one with the split one
            # Google "slice assignments"
            # This list is the key to avoiding recursion. It expands to contain
            # all of the split subsets by replacing them with the split ones
            updated_subsets[subset_index:subset_index + 1] = split_subsets

            test_scheme = scheme.Scheme(
                self.cfg, "Current Scheme", updated_subsets)

            try:
                best_result = self.analyse_scheme(best_scheme)
                new_result = self.analyse_scheme(test_scheme)
                log.info("Current best score is: " + str(best_result))
                log.info("Current new score is: " + str(new_result))
                if new_result.score < best_result.score:
                    # BUG FIX: the original message logged subset_index
                    # while calling it a "score"; log the actual score.
                    log.info("New score " + str(new_result.score) +
                             " is better and will be set to best score")
                    best_scheme = test_scheme

                    # Change this to the one with split subsets in it. Note that
                    # the subset_index now points a NEW subset, one that was split
                    all_subsets = updated_subsets
                else:
                    # Move to the next subset in the all_subsets list
                    subset_index += 1

            # In PhyML or RAxML, it is likely because of no alignment patterns,
            # catch that and move to the next subset without splitting.
            except PhylogenyProgramError:
                log.info("Phylogeny program generated an error so this subset was not split, see error above")
                subset_index += 1

    # Now start the Greedy Analysis: need to figure out how to make it go through more
    # than one scheme...
    start_scheme = best_scheme
    partnum = len(start_scheme.subsets)
    scheme_count = submodels.count_greedy_schemes(partnum)
    subset_count = submodels.count_greedy_subsets(partnum)
    self.cfg.progress.begin(scheme_count, subset_count)
    start_description = range(partnum)
    step = 1
    cur_s = 2

    # Now we try out all lumpings of the current scheme, to see if we can
    # find a better one and if we do, we just keep going.
    while True:
        log.info("***Greedy algorithm step %d***" % step)
        old_best_score = self.results.best_score

        # Get an iterable of all possible pairs of subsets in best_scheme.
        lumped_subsets = itertools.combinations(start_scheme.subsets, 2)

        for subset_grouping in lumped_subsets:
            scheme_name = cur_s
            lumped_scheme = neighbour.make_clustered_scheme(
                start_scheme, scheme_name, subset_grouping, self.cfg)

            new_result = self.analyse_scheme(lumped_scheme)

            log.debug("Difference in %s: %.1f",
                      self.cfg.model_selection,
                      (new_result.score - old_best_score))

            cur_s += 1

        if self.results.best_score != old_best_score:
            log.info("Analysed all schemes for this step. The best "
                     "scheme changed the %s score by %.1f units.",
                     self.cfg.model_selection,
                     (self.results.best_score - old_best_score))

            self.results.best_scheme.name = "step_%d" % step
            self.cfg.reporter.write_scheme_summary(
                self.results.best_scheme, self.results.best_result)

            # Continue the search from the best lumping found this step.
            start_scheme = self.results.best_scheme
        else:
            log.info("Analysed all schemes for this step and found no schemes "
                     "that improve the score, stopping")
            break

        # We're done if it's the scheme with everything together.
        if len(set(lumped_scheme.subsets)) == 1:
            break

        step += 1

    log.info("Greedy algorithm finished after %d steps" % step)
    log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
             % (self.results.best_scheme.name, self.cfg.model_selection,
                self.results.best_score))
    self.cfg.reporter.write_best_scheme(self.results)
def do_analysis(self):
    '''A greedy algorithm for heuristic partitioning searches'''
    log.info("Performing greedy analysis")

    partnum = len(self.cfg.user_subsets)
    self.cfg.progress.begin(
        submodels.count_greedy_schemes(partnum),
        submodels.count_greedy_subsets(partnum))

    # Begin from the fully partitioned scheme and score it once.
    start_scheme = scheme.create_scheme(
        self.cfg, "start_scheme", range(partnum))
    log.info("Analysing starting scheme (scheme %s)" % start_scheme.name)
    self.analyse_scheme(start_scheme)

    step = 1
    cur_s = 2

    # Keep lumping pairs of subsets for as long as a lumping improves
    # the score.
    while True:
        log.info("***Greedy algorithm step %d***" % step)
        score_before = self.results.best_score

        # Try every possible pairing of subsets in the current scheme.
        for pair in itertools.combinations(start_scheme.subsets, 2):
            candidate = neighbour.make_clustered_scheme(
                start_scheme, cur_s, pair, self.cfg)

            result = self.analyse_scheme(candidate)

            log.debug("Difference in %s: %.1f",
                      self.cfg.model_selection,
                      (result.score - score_before))

            cur_s += 1

        # No pairing improved the score: the search is over.
        if self.results.best_score == score_before:
            log.info("Analysed all schemes for this step and found no schemes "
                     "that improve the score, stopping")
            break

        log.info("Analysed all schemes for this step. The best "
                 "scheme changed the %s score by %.1f units.",
                 self.cfg.model_selection,
                 (self.results.best_score - score_before))

        self.results.best_scheme.name = "step_%d" % step
        self.cfg.reporter.write_scheme_summary(
            self.results.best_scheme, self.results.best_result)

        # Continue the search from the best lumping found this step.
        start_scheme = self.results.best_scheme

        # Everything merged into a single subset: nothing left to lump.
        if len(set(candidate.subsets)) == 1:
            break

        step += 1

    log.info("Greedy algorithm finished after %d steps" % step)
    log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
             % (self.results.best_scheme.name, self.cfg.model_selection,
                self.results.best_score))
    self.cfg.reporter.write_best_scheme(self.results)