def one_kmeans_step(self, start_subsets, step, tree_path):
    name_prefix = "step_%d" % (step)

    # 1. Make split subsets
    with logtools.indented(log, "Splitting subsets using k-means"):
        split_subs = self.split_subsets(start_subsets, tree_path)

    # 2. Analyse split subsets (this takes advantage of parallelisation)
    subs = []

    # make a list from the dictionary
    for vals in split_subs.values():
        subs.extend(vals)

    log.debug("%d subsets successfully split"
              % (len(subs) - len(start_subsets)))

    with logtools.indented(
            log, "Calculating scores of all new subsets that can be analysed"):
        self.analyse_list_of_subsets(subs)

    # 3. Build new list of subsets
    new_scheme_subs = self.build_new_subset_list(
        name_prefix, split_subs, start_subsets)

    # 4. Are we done yet?
    if len(new_scheme_subs) == len(list(start_subsets)):
        log.info("Could not improve %s score. Kmeans algorithm finished."
                 % (the_config.model_selection))
        done = True
    else:
        n_splits = len(new_scheme_subs) - len(start_subsets)
        if n_splits > 1:
            t = 'subsets'
        else:
            t = 'subset'
        log.info("The %s score of %d %s improved when split"
                 % (the_config.model_selection, n_splits, t))
        start_subsets = new_scheme_subs
        done = False

    return done, start_subsets
def clean_scheme(self, start_scheme):
    # Look for and fix up subsets that are too small or don't have all states
    keep_going = True
    merges = 0

    with logtools.indented(
            log,
            "*** Checking subsets from scheme '%s' meet --min-subset-size "
            "and --all-states settings ***" % start_scheme.name):

        while keep_going:
            subsets = [s for s in start_scheme.subsets]

            # sort the subsets, to keep results consistent over re-runs
            subsets.sort(key=lambda x: 1.0 / float(len(x.columns)))

            # run through all subsets
            for i, sub in enumerate(subsets):
                found = False
                state_problems = self.alignment.check_state_probs(
                    sub, the_config)
                if (len(sub.columns) < the_config.min_subset_size
                        or state_problems):
                    # merge that subset with its nearest neighbour
                    new_pair = neighbour.get_closest_subset(
                        sub, subsets, the_config)
                    log.info("Subset '%s' will be merged with subset '%s'"
                             % (new_pair[0].name, new_pair[1].name))
                    new_pair_merged = subset_ops.merge_subsets(new_pair)
                    start_scheme = neighbour.make_clustered_scheme(
                        start_scheme, "cleaned_scheme", new_pair,
                        new_pair_merged, the_config)
                    the_config.progress.begin(1, 1)
                    self.analyse_scheme(start_scheme)
                    subsets = [s for s in start_scheme.subsets]
                    merges += 1
                    found = True
                    break

            # if we got to here, there were no subsets to merge
            if not found:
                keep_going = False

        if len(subsets) == 1:
            log.error(
                "The settings you have used for --all-states and/or "
                "--min-subset-size mean that all of your subsets have been "
                "merged into one prior to any analysis. Thus, no analysis "
                "is necessary. Please check and try again.")
            raise AnalysisError

        log.info("%d subsets merged because of --min-subset-size and/or "
                 "--all-states settings" % merges)

    return start_scheme
def do_analysis(self):
    '''A kmeans algorithm for heuristic partitioning searches'''
    start_result, start_scheme, tree_path = self.setup()

    step = 0
    start_subsets = list(start_scheme.subsets)  # we only work on lists of subsets

    self.analyse_list_of_subsets(start_subsets)

    # now we suppress ExternalProgramError for the rest of the algorithm
    the_config.suppress_errors = True

    for s in start_subsets:
        if s.fabricated:
            log.error("One or more of your starting datablocks could not "
                      "be analysed. Please check your data and try again. "
                      "One way to fix this is to join your small datablocks "
                      "together into larger datablocks.")
            raise AnalysisError

    while True:
        step += 1
        with logtools.indented(log,
                               "***k-means algorithm step %d***" % step):
            done, start_subsets = self.one_kmeans_step(
                start_subsets, step, tree_path)
            if done:
                break

    # We're done; we just need to deal with fabricated subsets
    final_scheme = self.finalise_fabrication(start_subsets, step)

    # Finally, for krmeans, we put the invariant sites back with their
    # nearest variable neighbours
    if the_config.search == 'krmeans':
        log.info("Reassigning invariant sites for krmeans algorithm")
        # the definition of krmeans is that we reassign the zero entropies
        final_subsets = self.reassign_invariant_sites(final_scheme.subsets)
        final_scheme = scheme.Scheme(the_config, "final_scheme_reassigned",
                                     final_subsets)

    log.info("Analysing final scheme")
    final_result = self.analyse_scheme(final_scheme)

    self.report(step)

    if not the_config.quick:
        the_config.reporter.write_scheme_summary(final_scheme, final_result)

    return final_scheme
def setup(self):
    log.warning(
        "Warning as of April 2016: we have noticed that the kmeans "
        "algorithm does not perform well on some simulated datasets. "
        "We are working on investigating and addressing this, "
        "but in the meantime we suggest being very cautious about using "
        "this algorithm. At the very least, you should try other approaches "
        "(e.g. partitioning by locus), and investigate your answers carefully "
        "(both the trees and the partitioning schemes). If you have any "
        "questions, please get in touch on the google group. Note that this "
        "warning does not apply to cases where you are using models that "
        "have an ascertainment bias for datasets that include only variable "
        "sites, as is often the case with morphological analyses.")

    # set the default subset size to 100 for kmeans analyses
    if the_config.min_subset_size == False:
        the_config.min_subset_size = 100

    partnum = len(the_config.user_subsets)
    the_config.progress.begin(1, 1)

    # Start with the most partitioned scheme
    start_description = range(partnum)
    start_scheme = scheme.create_scheme(the_config, "start_scheme",
                                        start_description)

    site_max = sum([len(s.columns) for s in start_scheme.subsets])

    if the_config.min_subset_size > site_max:
        log.error("The minimum subset size must be smaller than the "
                  "total number of sites you want to analyse. Your minimum "
                  "subset size is %d, and your alignment is %d sites. Please "
                  "check and try again."
                  % (the_config.min_subset_size, site_max))
        raise AnalysisError

    with logtools.indented(
            log, "**Analysing starting scheme (scheme %s)**"
            % start_scheme.name):
        start_result = self.analyse_scheme(start_scheme)
        if not the_config.quick:
            the_config.reporter.write_scheme_summary(
                start_scheme, start_result)

    tree_path = the_config.processor.make_tree_path(
        self.filtered_alignment_path)

    if the_config.kmeans == 'tiger' and the_config.datatype != 'morphology':
        log.error("You have selected kmeans and tiger rates. This is an "
                  "unsupported option for anything except morphological "
                  "data. The kmeans algorithm now works with entropies, "
                  "not TIGER rates.")
        raise AnalysisError

    return start_result, start_scheme, tree_path
def do_analysis(self):
    '''A kmeans algorithm for heuristic partitioning searches'''
    start_result, start_scheme, tree_path = self.setup()

    step = 0
    start_subsets = list(start_scheme.subsets)  # we only work on lists of subsets

    self.analyse_list_of_subsets(start_subsets)

    # now we suppress ExternalProgramError for the rest of the algorithm
    the_config.suppress_errors = True

    for s in start_subsets:
        if s.fabricated:
            log.error("One or more of your starting datablocks could not "
                      "be analysed. Please check your data and try again. "
                      "One way to fix this is to join your small datablocks "
                      "together into larger datablocks.")
            raise AnalysisError

    while True:
        step += 1
        with logtools.indented(log,
                               "***k-means algorithm step %d***" % step):
            done, start_subsets = self.one_kmeans_step(
                start_subsets, step, tree_path)
            if done:
                break

    # We're done; we just need to deal with fabricated subsets
    final_scheme = self.finalise_fabrication(start_subsets, step)

    log.info("Analysing final scheme")
    final_result = self.analyse_scheme(final_scheme)

    self.report(step)

    return final_scheme
def setup(self):
    log.warning(
        "Warning as of April 2016: we have noticed that the kmeans "
        "algorithm does not perform well on some simulated datasets. "
        "We are working on investigating and addressing this, "
        "but in the meantime we suggest being very cautious about using "
        "this algorithm. At the very least, you should try other approaches "
        "(e.g. partitioning by locus), and investigate your answers carefully "
        "(both the trees and the partitioning schemes). If you have any "
        "questions, please get in touch on the google group.")

    # set the default subset size to 100 for kmeans analyses
    if the_config.min_subset_size == False:
        the_config.min_subset_size = 100

    partnum = len(the_config.user_subsets)
    the_config.progress.begin(1, 1)

    # Start with the most partitioned scheme
    start_description = range(partnum)
    start_scheme = scheme.create_scheme(the_config, "start_scheme",
                                        start_description)

    if len(start_scheme.subsets) > 1:
        log.error("The k-means algorithm is designed to analyse "
                  "the entire alignment at once. To use it, please define a "
                  "single data block that includes all of your sites, and "
                  "try again.")
        raise AnalysisError

    site_max = sum([len(s.columns) for s in start_scheme.subsets])

    if the_config.min_subset_size > site_max:
        log.error("The minimum subset size must be smaller than the "
                  "total number of sites you want to analyse. Your minimum "
                  "subset size is %d, and your alignment is %d sites. Please "
                  "check and try again."
                  % (the_config.min_subset_size, site_max))
        raise AnalysisError

    with logtools.indented(
            log, "**Analysing starting scheme (scheme %s)**"
            % start_scheme.name):
        start_result = self.analyse_scheme(start_scheme)
        if not the_config.quick:
            the_config.reporter.write_scheme_summary(
                start_scheme, start_result)

    tree_path = the_config.processor.make_tree_path(
        self.filtered_alignment_path)

    if the_config.kmeans == 'tiger' and the_config.datatype != 'morphology':
        try:
            from _tiger import TigerDNA
            the_config.TigerDNA = TigerDNA
        except ImportError:
            log.error("Couldn't find compiled tiger code.")
            log.error("You have selected kmeans and tiger rates. This is an "
                      "unsupported option; if you still wish to use it, you "
                      "must compile the tiger code.")
            log.error("Once you compile the tiger code, this option will "
                      "work, but please note that it is an unsupported "
                      "option. For empirical work we recommend using entropy "
                      "calculations for site rates, which is the default "
                      "behaviour for the kmeans algorithm in PF2.")
            raise AnalysisError
    else:
        the_config.TigerDNA = None

    return start_result, start_scheme, tree_path
def finalise_fabrication(self, start_subsets, step):

    fabricated_subsets = []
    for s in start_subsets:

        # here we put a sensible lower limit on the size of subsets
        if len(s.columns) < the_config.min_subset_size:
            s.fabricated = True
            log.debug("Subset %s with only %d sites found"
                      % (s.subset_id, len(s.columns)))

        # here we can test if the alignment has all states:
        state_probs = self.alignment.check_state_probs(s, the_config)
        if state_probs:
            s.fabricated = True
            log.debug("Subset %s does not have all states in the alignment",
                      s.subset_id)

        if s.fabricated:
            fabricated_subsets.append(s)
            log.debug("added %s to fabricated subsets", s.name)

    if fabricated_subsets:
        with logtools.indented(log, "Finalising partitioning scheme"):
            log.debug("There are %d/%d fabricated subsets"
                      % (len(fabricated_subsets), len(start_subsets)))

            i = 1
            while fabricated_subsets:

                all_subs = start_subsets

                # occasionally subsets with all values == 0.0 are given a
                # centroid of None by scikit-learn. The true entropy here
                # is 0.0 for all sites, so the true centroid is 0.0
                for s in all_subs:
                    if s.centroid is None:
                        s.centroid = [0.0]
                        log.debug("Fixed a subset with a centroid of None")
                        log.debug("The subset has %d columns"
                                  % len(s.columns))

                s = fabricated_subsets.pop(0)

                log.debug("Working on fabricated subset %s with %d sites"
                          % (s.subset_id, len(s.columns)))
                log.info("Finalising subset %d", i)
                i = i + 1

                all_subs.remove(s)

                centroid = s.centroid

                best_match = None

                # get the closest subset to s
                for sub in all_subs:
                    centroid_array = [sub.centroid, centroid]
                    euclid_dist = spatial.distance.pdist(centroid_array)
                    if best_match is None or euclid_dist < best_match:
                        best_match = euclid_dist
                        closest_sub = sub

                # join s with closest_sub to make merged_sub
                merged_sub = subset_ops.merge_subsets([s, closest_sub])

                # remove closest_sub
                all_subs.remove(closest_sub)

                # and if closest_sub was fabricated too, we remove it here
                if fabricated_subsets.count(closest_sub):
                    fabricated_subsets.remove(closest_sub)

                # analyse the merged subset
                self.analyse_list_of_subsets([merged_sub])

                # here we put a sensible lower limit on the size of subsets
                if len(merged_sub.columns) < the_config.min_subset_size:
                    merged_sub.fabricated = True

                # if the merged subset is itself fabricated, add it to the
                # fabricated list so it gets merged again
                if merged_sub.fabricated:
                    fabricated_subsets.append(merged_sub)

                all_subs.append(merged_sub)
    else:
        all_subs = start_subsets

    # now build a scheme from all_subs, and it should work
    final_scheme = scheme.Scheme(the_config, "final_scheme", all_subs)

    # return final scheme
    return final_scheme
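# Illustrative sketch (not part of PartitionFinder): how the closest-subset
# search in finalise_fabrication measures distance. pdist() on a two-row
# array returns a one-element array holding the Euclidean distance between
# the two centroids, which is what gets compared against best_match above.
# The centroid values below are made up; real centroids come from the
# entropy-based k-means clustering of sites.
import numpy as np
from scipy import spatial

centroid_a = [0.12, 0.80]  # hypothetical centroid of a fabricated subset
centroid_b = [0.50, 0.75]  # hypothetical centroid of a candidate neighbour

euclid_dist = spatial.distance.pdist(np.array([centroid_a, centroid_b]))
print(euclid_dist)  # a single Euclidean distance, e.g. [0.383...]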
def do_analysis(self):
    # initialisation steps
    model_selection = the_config.model_selection
    partnum = len(the_config.user_subsets)

    scheme_count = submodels.count_relaxed_clustering_schemes(
        partnum, the_config.cluster_percent, the_config.cluster_max)
    subset_count = submodels.count_relaxed_clustering_subsets(
        partnum, the_config.cluster_percent, the_config.cluster_max)

    log.info("PartitionFinder will have to analyse %d subsets to"
             " complete this analysis" % subset_count)
    the_config.progress.begin(scheme_count, subset_count)

    # Start with the most partitioned scheme, and record it.
    with logtools.indented(log, "*** Analysing starting scheme ***"):
        the_config.progress.begin(scheme_count, partnum)
        start_scheme = scheme.create_scheme(the_config, "start_scheme",
                                            range(partnum))
        start_result = self.analyse_scheme(start_scheme)
        start_score = start_result.score
        if not the_config.quick:
            the_config.reporter.write_scheme_summary(
                self.results.best_scheme, self.results.best_result)

    subsets = [s for s in start_scheme.subsets]
    partnum = len(subsets)
    step = 1

    while True:
        with logtools.indented(
                log,
                "*** Relaxed clustering algorithm step %d of up to %d ***"
                % (step, partnum - 1)):

            # get distances between subsets
            max_schemes = comb(len(start_scheme.subsets), 2)
            log.info("Measuring the similarity of %d subset pairs"
                     % max_schemes)
            d_matrix = neighbour.get_distance_matrix(
                subsets, the_config.cluster_weights)

            if step == 1:
                # Now initialise a change-in-info-score matrix to inf
                c_matrix = np.empty(d_matrix.shape)
                c_matrix[:] = np.inf
                c_matrix = spatial.distance.squareform(c_matrix)

            # 1. pick the top N subset pairs from the distance matrix
            cutoff = int(math.ceil(max_schemes *
                                   (the_config.cluster_percent * 0.01)))
            if cutoff <= 0:
                cutoff = 1
            if (the_config.cluster_max is not None
                    and cutoff > the_config.cluster_max):
                cutoff = the_config.cluster_max
            log.info("Choosing the %d most similar subset pairs" % cutoff)
            closest_pairs = neighbour.get_N_closest_subsets(
                subsets, the_config, cutoff, d_matrix)

            # 2. analyse the K subsets in the top N that have not yet been
            #    analysed
            pairs_todo = neighbour.get_pairs_todo(closest_pairs, c_matrix,
                                                  subsets)
            if len(pairs_todo) > 0:
                log.info("Analysing %d new subset pairs" % len(pairs_todo))
                new_subs = []
                sub_tuples = []
                for pair in pairs_todo:
                    new_sub = subset_ops.merge_subsets(pair)
                    new_subs.append(new_sub)
                    sub_tuples.append((new_sub, pair))

                the_config.progress.begin(scheme_count, len(new_subs))
                self.analyse_list_of_subsets(new_subs)

                # 3. for all K new subsets, update the improvement matrix
                #    and find the best pair
                log.info("Finding the best partitioning scheme")
                diffs = []
                scheme_name = "step_%d" % (step)
                for t in sub_tuples:
                    pair_merged = t[0]
                    pair = t[1]
                    new_scheme = neighbour.make_clustered_scheme(
                        start_scheme, scheme_name, pair, pair_merged,
                        the_config)
                    r = self.analyse_scheme(new_scheme)
                    diff = r.score - start_score
                    diffs.append(diff)

                c_matrix = neighbour.update_c_matrix(
                    c_matrix, sub_tuples, subsets, diffs)

            # 4. Find the best pair of subsets, and build a scheme based on
            #    that. Note that the c_matrix includes the diagonal, which is
            #    all zeros, since that is equivalent to comparing a scheme to
            #    itself. So we need to be careful to only proceed if we have
            #    a negative change, which indicates an improvement in the
            #    score.
            best_change = np.amin(c_matrix)

            best_scheme = start_scheme

            if best_change >= 0:
                log.info("Found no schemes that improve the score, stopping")
                break

            median_improvement = np.median(c_matrix[c_matrix < 0])

            while best_change <= median_improvement:

                best_pair = neighbour.get_best_pair(c_matrix, best_change,
                                                    subsets)
                best_merged = subset_ops.merge_subsets(best_pair)
                best_scheme = neighbour.make_clustered_scheme(
                    start_scheme, scheme_name, best_pair, best_merged,
                    the_config)
                start_scheme = best_scheme

                log.info("Combining subsets: '%s' and '%s'"
                         % (best_pair[0].name, best_pair[1].name))
                log.info("This improves the %s score by: %s",
                         the_config.model_selection, str(abs(best_change)))

                # reset the c_matrix and the subset list
                c_matrix = neighbour.reset_c_matrix(
                    c_matrix, list(best_pair), [best_merged], subsets)

                # we update the subset list in a way that means its structure
                # tracks the c_matrix
                subsets = neighbour.reset_subsets(subsets, list(best_pair),
                                                  [best_merged])

                best_change = np.amin(c_matrix)

                if the_config.search == 'rcluster':
                    # with rcluster we just take the single best change;
                    # otherwise we are using rclusterf, which continues in
                    # this loop
                    break

            # the best change can be updated by a fraction at this point,
            # because calculating the info score on the whole alignment
            # is a little different from doing it on the one subset
            best_result = self.analyse_scheme(best_scheme)
            best_change = self.results.best_score - start_score

            log.info("The best scheme has %d subsets and improves the %s "
                     "score by %.2f to %.1f",
                     len(best_scheme.subsets),
                     the_config.model_selection,
                     np.abs(best_change),
                     self.results.best_score)
            start_scheme = best_scheme
            start_score = best_result.score

            if not the_config.quick:
                the_config.reporter.write_scheme_summary(
                    best_scheme, best_result)

            if len(set(start_scheme.subsets)) == 1:
                break

        step += 1

    log.info("Relaxed clustering algorithm finished after %d steps" % step)
    log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
             % (self.results.best_scheme.name, model_selection,
                self.results.best_score))

    if the_config.min_subset_size or the_config.all_states:
        best_scheme = self.clean_scheme(self.results.best_scheme)
        best_result = self.analyse_scheme(best_scheme)

        # scores after cleaning can be worse, so we reset these trackers...
        self.results.best_result = best_result
        self.results.best_score = best_result.score
        self.results.best_scheme = best_scheme

        log.info("Best scoring scheme after cleaning is scheme %s, with %s "
                 "score of %.3f"
                 % (self.results.best_scheme.name, model_selection,
                    self.results.best_score))

    the_config.reporter.write_best_scheme(self.results)
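# Illustrative sketch (not part of PartitionFinder): the relationship between
# the condensed distance vector and the square c_matrix used above. For n
# subsets there are n*(n-1)/2 pairs; squareform() expands a condensed vector
# of that length into an n x n symmetric matrix with a zero diagonal, which
# is why diagonal entries can never register as an "improvement".
import numpy as np
from scipy import spatial

n = 4  # hypothetical number of subsets
condensed = np.full((n * n - n) // 2, np.inf)  # one entry per subset pair
square = spatial.distance.squareform(condensed)

print(condensed.shape)  # (6,)
print(square.shape)     # (4, 4); off-diagonal entries are inf, diagonal is 0.0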
def do_analysis(self):
    '''A greedy algorithm for heuristic partitioning searches'''
    partnum = len(the_config.user_subsets)

    scheme_count = submodels.count_greedy_schemes(partnum)
    subset_count = submodels.count_greedy_subsets(partnum)

    the_config.progress.begin(scheme_count, subset_count)

    # Start with the most partitioned scheme, and record it.
    with logtools.indented(log, "*** Analysing starting scheme ***"):
        the_config.progress.begin(scheme_count, partnum)
        start_scheme = scheme.create_scheme(the_config, "start_scheme",
                                            range(partnum))
        start_result = self.analyse_scheme(start_scheme)
        start_score = start_result.score
        if not the_config.quick:
            the_config.reporter.write_scheme_summary(
                self.results.best_scheme, self.results.best_result)

    subsets = [s for s in start_scheme.subsets]

    step = 1
    while len(set(start_scheme.subsets)) > 1:
        with logtools.indented(log,
                               "***Greedy algorithm step %d***" % step):
            name_prefix = "step_%d" % (step)

            # get distances between subsets
            max_schemes = comb(len(start_scheme.subsets), 2)

            # this is a fake distance matrix, so that the greedy algorithm
            # can use all the tricks of the relaxed clustering algorithm
            dim = len(subsets)
            d_matrix = np.zeros(((dim * dim) - dim) // 2)
            d_matrix[:] = np.inf

            if step == 1:
                # Now initialise a change-in-info-score matrix to inf
                c_matrix = np.empty(d_matrix.shape)
                c_matrix[:] = np.inf
                c_matrix = spatial.distance.squareform(c_matrix)

            # 1. pick the top N subset pairs from the distance matrix
            cutoff = max_schemes  # this defines the greedy algorithm: we look at all schemes
            closest_pairs = neighbour.get_N_closest_subsets(
                subsets, the_config, cutoff, d_matrix)

            # 2. analyse the subsets in the top N that have not yet been
            #    analysed
            pairs_todo = neighbour.get_pairs_todo(closest_pairs, c_matrix,
                                                  subsets)
            if len(pairs_todo) > 0:
                log.info("Analysing %d new subset pairs" % len(pairs_todo))
                new_subs = []
                sub_tuples = []
                for pair in pairs_todo:
                    new_sub = subset_ops.merge_subsets(pair)
                    new_subs.append(new_sub)
                    sub_tuples.append((new_sub, pair))

                the_config.progress.begin(scheme_count, len(new_subs))
                self.analyse_list_of_subsets(new_subs)

                # 3. for all K new subsets, update the improvement matrix
                #    and find the best pair
                log.info("Finding the best partitioning scheme")
                diffs = []
                scheme_name = "step_%d" % (step)
                for t in sub_tuples:
                    pair_merged = t[0]
                    pair = t[1]
                    new_scheme = neighbour.make_clustered_scheme(
                        start_scheme, scheme_name, pair, pair_merged,
                        the_config)
                    r = self.analyse_scheme(new_scheme)
                    diff = r.score - start_score
                    diffs.append(diff)

                c_matrix = neighbour.update_c_matrix(
                    c_matrix, sub_tuples, subsets, diffs)

            # 4. Find the best pair of subsets, and build a scheme based on
            #    that. Note that the c_matrix includes the diagonal, which is
            #    all zeros, since that is equivalent to comparing a scheme to
            #    itself. So we need to be careful to only proceed if we have
            #    a negative change, which indicates an improvement in the
            #    score.
            best_change = np.amin(c_matrix)

            log.debug("Biggest improvement in info score: %s",
                      str(best_change))

            if best_change >= 0:
                log.info("Found no schemes that improve the score, stopping")
                break

            best_pair = neighbour.get_best_pair(c_matrix, best_change,
                                                subsets)
            best_merged = subset_ops.merge_subsets(best_pair)
            best_scheme = neighbour.make_clustered_scheme(
                start_scheme, scheme_name, best_pair, best_merged,
                the_config)
            best_result = self.analyse_scheme(best_scheme)

            # the best change can be updated by a fraction at this point,
            # because calculating the info score on the whole alignment
            # is a little different from doing it on the one subset
            best_change = self.results.best_score - start_score

            log.info("Best scheme combines subsets: '%s' and '%s'"
                     % (best_pair[0].name, best_pair[1].name))
            log.info("The best scheme improves the %s score by %.2f to %.1f",
                     the_config.model_selection,
                     np.abs(best_change),
                     self.results.best_score)

            start_scheme = best_scheme
            start_score = best_result.score

            log.debug("Best pair: %s", str([s.name for s in best_pair]))
            log.debug("Merged into: %s", str([best_merged.name]))

            # 5. reset the c_matrix and the subset list
            c_matrix = neighbour.reset_c_matrix(c_matrix, list(best_pair),
                                                [best_merged], subsets)

            # we update the subset list in a way that matches how we update
            # the c_matrix
            subsets = neighbour.reset_subsets(subsets, list(best_pair),
                                              [best_merged])

            if not the_config.quick:
                the_config.reporter.write_scheme_summary(
                    best_scheme, best_result)

        step += 1

    log.info("Greedy algorithm finished after %d steps" % step)
    log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
             % (self.results.best_scheme.name,
                the_config.model_selection,
                self.results.best_score))

    the_config.reporter.write_best_scheme(self.results)
def setup(self):
    if the_config.datatype != 'morphology':
        log.warning(
            "METHOD DISCONTINUED: There is increasing evidence that the "
            "kmeans algorithm can lead to poor inferences, so we have "
            "discontinued its use for most data types. You should instead "
            "use other approaches (e.g. partitioning by locus and codon "
            "position). If you have any questions, please get in touch on "
            "the google group. More information on the empirical issues "
            "can be found in this paper: "
            "http://www.sciencedirect.com/science/article/pii/S1055790316302780")
        raise AnalysisError
    else:
        log.warning(
            "USE CAUTION: There is increasing evidence that the kmeans "
            "algorithm can lead to poor inferences, so we have discontinued "
            "its use for most data types (i.e. amino acid and nucleotide "
            "data). More information on the empirical issues can be found "
            "in this paper: "
            "http://www.sciencedirect.com/science/article/pii/S1055790316302780. "
            "We have kept the method available for morphological data, but "
            "warn users that the method is: experimental, untested on "
            "morphological data (either empirical or simulated), and may "
            "give incorrect topologies and branch lengths (see link to "
            "paper above).")

    # set the default subset size to 100 for kmeans analyses
    if the_config.min_subset_size == False:
        the_config.min_subset_size = 100

    partnum = len(the_config.user_subsets)
    the_config.progress.begin(1, 1)

    # Start with the most partitioned scheme
    start_description = range(partnum)
    start_scheme = scheme.create_scheme(
        the_config, "start_scheme", start_description)

    site_max = sum([len(s.columns) for s in start_scheme.subsets])

    if the_config.min_subset_size > site_max:
        log.error("The minimum subset size must be smaller than the "
                  "total number of sites you want to analyse. Your minimum "
                  "subset size is %d, and your alignment is %d sites. Please "
                  "check and try again."
                  % (the_config.min_subset_size, site_max))
        raise AnalysisError

    with logtools.indented(
            log, "**Analysing starting scheme (scheme %s)**"
            % start_scheme.name):
        start_result = self.analyse_scheme(start_scheme)
        if not the_config.quick:
            the_config.reporter.write_scheme_summary(start_scheme,
                                                     start_result)

    tree_path = the_config.processor.make_tree_path(
        self.filtered_alignment_path)

    if the_config.kmeans == 'tiger' and the_config.datatype != 'morphology':
        log.error("You have selected kmeans and tiger rates. This is an "
                  "unsupported option for anything except morphological "
                  "data. The kmeans algorithm now works with entropies, "
                  "not TIGER rates.")
        raise AnalysisError

    return start_result, start_scheme, tree_path
def setup(self):
    # set the default subset size to 100 for kmeans analyses
    if the_config.min_subset_size == False:
        the_config.min_subset_size = 100

    partnum = len(the_config.user_subsets)
    the_config.progress.begin(1, 1)

    # Start with the most partitioned scheme
    start_description = range(partnum)
    start_scheme = scheme.create_scheme(the_config, "start_scheme",
                                        start_description)

    if len(start_scheme.subsets) > 1:
        log.error("The k-means algorithm is designed to analyse "
                  "the entire alignment at once. To use it, please define a "
                  "single data block that includes all of your sites, and "
                  "try again.")
        raise AnalysisError

    site_max = sum([len(s.columns) for s in start_scheme.subsets])

    if the_config.min_subset_size > site_max:
        log.error("The minimum subset size must be smaller than the "
                  "total number of sites you want to analyse. Your minimum "
                  "subset size is %d, and your alignment is %d sites. Please "
                  "check and try again."
                  % (the_config.min_subset_size, site_max))
        raise AnalysisError

    with logtools.indented(
            log, "**Analysing starting scheme (scheme %s)**"
            % start_scheme.name):
        start_result = self.analyse_scheme(start_scheme)
        if not the_config.quick:
            the_config.reporter.write_scheme_summary(
                start_scheme, start_result)

    tree_path = the_config.processor.make_tree_path(
        self.filtered_alignment_path)

    if the_config.kmeans == 'tiger':
        try:
            from _tiger import TigerDNA
            the_config.TigerDNA = TigerDNA
        except ImportError:
            log.error("Couldn't find compiled tiger code.")
            log.error("You have selected kmeans and tiger rates. This is an "
                      "unsupported option; if you still wish to use it, you "
                      "must compile the tiger code.")
            log.error("Once you compile the tiger code, this option will "
                      "work, but please note that it is an unsupported "
                      "option. For empirical work we recommend using entropy "
                      "calculations for site rates, which is the default "
                      "behaviour for the kmeans algorithm in PF2.")
            raise AnalysisError
    else:
        the_config.TigerDNA = None

    return start_result, start_scheme, tree_path