def install_base(self):
    """Create the database schema and run every enabled parser against it.

    Opens a PostgreSQL connection using the ``pg_conf`` settings, applies
    ``scheme/scheme.sql`` from ``self.options['current_folder']`` and then
    feeds ``fias_xml.rar`` from the same folder to each enabled parser.

    Progress is published in ``state['install'][<parser class name>]``:
    ``status`` is ``'started'`` while parsing and ends up as either
    ``'complete'`` or ``'failed'`` (with a ``reason``).  A failing parser does
    not stop the remaining parsers.
    """
    db_conn = psycopg2.connect(
        host=pg_conf['host'],
        port=pg_conf['port'],
        user=pg_conf['login'],
        password=pg_conf['password'],
        database=pg_conf['db_name'],
        cursor_factory=DictCursor
    )
    try:
        create_scheme(
            db_conn, self.options['current_folder'] + '/scheme/scheme.sql')

        # Parsers are toggled by (un)commenting entries in this list.
        parsers = [
            #AddressObjectType,
            #CenterStatus,
            #CurrentStatus,
            #OperationStatus,
            #ActualStatus,
            #IntervalStatus,
            #StructureStatus,
            #HouseStateStatus,
            #EstateStatus,
            #NormativeDocumentType,
            #NormativeDocument,
            #AddressObject,
            #House,
            #HouseInterval,
            # Room,
            Stead,
            Landmark,
        ]

        for parser in parsers:
            obj_parser = parser(
                db_connection=db_conn,
                archive=self.options['current_folder'] + '/fias_xml.rar')
            obj_parser.on('start_element:handled', change_state)

            state['install'][parser.__name__] = {
                'status': 'started',
                'started_at': str(datetime.now()),
                'stopped_at': None,
                'records_added': obj_parser.records_counter
            }

            try:
                obj_parser.parse()
            except Exception as exc:
                # Best effort: record the failure and carry on with the
                # remaining parsers.
                parse_failed = True
                state['install'][parser.__name__].update({
                    'status': 'failed',
                    'reason': str(exc),
                    'stopped_at': str(datetime.now()),
                    'records_added': obj_parser.records_counter
                })
            else:
                parse_failed = False

            # Commit in both cases, exactly as before.
            db_conn.commit()

            # Only a successful parse may be reported as complete; previously
            # this update also ran after a failure and hid it.
            if not parse_failed:
                state['install'][parser.__name__].update({
                    'status': 'complete',
                    'stopped_at': str(datetime.now()),
                    'records_added': obj_parser.records_counter
                })
    finally:
        # The connection is local to this call, so release it on every path.
        db_conn.close()
def setup(self):
    """Prepare a kmeans analysis and analyse the starting scheme.

    Logs the standing warning about the kmeans algorithm, rejects the
    unsupported kmeans/TIGER combination, applies the default minimum subset
    size, then builds and analyses the most partitioned scheme.

    Returns:
        A ``(start_result, start_scheme, tree_path)`` tuple.

    Raises:
        AnalysisError: if TIGER rates are requested for non-morphological
            data, or if the minimum subset size exceeds the number of sites.
    """
    # The messages below use implicit string concatenation.  The previous
    # backslash continuations inside the literals leaked the source
    # indentation into the log output.
    log.warning(
        "Warning as of April 2016: We have noticed that the kmeans "
        "algorithm does not perform well on some simulated datasets. "
        "We are working on investigating and addressing this "
        "but in the mean time we suggest being very cautious about using "
        "this algorithm. At the very least, you should try other approaches "
        "(e.g. partitioning by locus), and investigate your answers carefully "
        "(both the trees and the partitioning schemes). If you have any "
        "questions, please get in touch on the google group. Note that this "
        "warning does not apply to cases where you are using models that have "
        "an ascertainment bias for datasets that include only variable sites "
        "as is often the case with morphological analyses.")

    # Validate the configuration up front so an unsupported run fails before
    # the (expensive) starting scheme is analysed.
    if the_config.kmeans == 'tiger' and the_config.datatype != 'morphology':
        log.error(
            "You have selected kmeans and tiger "
            "rates. This is an unsupported option for anything except "
            "morphological data. The kmeans algorithm "
            "now works with entropies, not TIGER rates.")
        raise AnalysisError

    # set the default subset size to 100 for kmeans analyses
    # (`== False` is kept on purpose: it matches both False and 0)
    if the_config.min_subset_size == False:
        the_config.min_subset_size = 100

    partnum = len(the_config.user_subsets)
    the_config.progress.begin(1, 1)

    # Start with the most partitioned scheme
    start_description = range(partnum)
    start_scheme = scheme.create_scheme(
        the_config, "start_scheme", start_description)

    site_max = sum(len(s.columns) for s in start_scheme.subsets)

    if the_config.min_subset_size > site_max:
        log.error(
            "The minimum subset size must be smaller than the "
            "total number of sites you want to analyse. Your minimum "
            "subset size is %d, and your alignment is %d sites. Please "
            "check and try again.",
            the_config.min_subset_size, site_max)
        raise AnalysisError

    with logtools.indented(
            log,
            "**Analysing starting scheme (scheme %s)**" % start_scheme.name):
        start_result = self.analyse_scheme(start_scheme)

        if not the_config.quick:
            the_config.reporter.write_scheme_summary(
                start_scheme, start_result)

        tree_path = the_config.processor.make_tree_path(
            self.filtered_alignment_path)

    return start_result, start_scheme, tree_path
def do_analysis(self):
    """Cluster the partitions step by step until one subset remains.

    Starting from the scheme with every partition on its own, each step asks
    ``neighbour.get_nearest_neighbour_scheme`` for the next clustered scheme
    and analyses it.  The loop ends once a scheme has a single subset, after
    which the best scheme is reported.
    """
    log.info("Performing clustering analysis")

    partnum = len(self.cfg.partitions)
    self.cfg.progress.begin(partnum, 2 * partnum - 1)

    # Begin from the most partitioned scheme and analyse it first.
    current_scheme = scheme.create_scheme(
        self.cfg, "start_scheme", range(len(self.cfg.partitions)))
    log.info("Analysing starting scheme (scheme %s)" % current_scheme.name)
    self.analyse_scheme(current_scheme)

    # Walk through successive clusterings of the previous scheme.
    step = 0
    all_combined = False
    while not all_combined:
        step += 1
        log.info("***Clustering algorithm step %d of %d***" %
                 (step, partnum - 1))

        # The neighbour module decides which subsets are most similar
        # (e.g. a combined rank ordering of euclidean distances).
        current_scheme = neighbour.get_nearest_neighbour_scheme(
            current_scheme, "step_%d" % step, self.cfg)
        self.analyse_scheme(current_scheme)

        # A single distinct subset means everything has been merged.
        all_combined = len(set(current_scheme.subsets)) == 1

    self.cfg.progress.end()

    txt = ("Best scheme using Clustering Analysis, analysed with %s"
           % self.cfg.model_selection)
    self.cfg.reporter.write_best_scheme(txt, self.results)
def do_analysis(self):
    """Run the strict clustering search over the user-defined subsets.

    Analyses the fully partitioned scheme, then repeatedly analyses the
    scheme returned by ``neighbour.get_nearest_neighbour_scheme`` until a
    scheme with a single subset has been analysed, and finally writes out the
    best scheme.
    """
    log.info("Performing strict clustering analysis")

    partnum = len(the_config.user_subsets)
    the_config.progress.begin(partnum, 2 * partnum - 1)

    # Begin from the most partitioned scheme and analyse it first.
    current_scheme = scheme.create_scheme(
        the_config, "start_scheme", range(partnum))
    log.info("Analysing starting scheme (scheme %s)" % current_scheme.name)
    self.analyse_scheme(current_scheme)

    # Walk through successive clusterings of the previous scheme.
    step = 0
    all_combined = False
    while not all_combined:
        step += 1
        log.info("***Strict clustering algorithm step %d of %d***" %
                 (step, partnum - 1))

        # The neighbour module decides which subsets are most similar
        # (e.g. a combined rank ordering of euclidean distances).
        current_scheme = neighbour.get_nearest_neighbour_scheme(
            current_scheme, "step_%d" % step, the_config)
        self.analyse_scheme(current_scheme)

        # A single distinct subset means everything has been merged.
        all_combined = len(set(current_scheme.subsets)) == 1

    the_config.progress.end()
    the_config.reporter.write_best_scheme(self.results)
def do_analysis(self):
    """Prototype kmeans splitting driven by PhyML per-site likelihoods.

    Builds the starting scheme from the user subsets, writes an alignment for
    each subset, runs ``phyml.analyse`` with ``--print_site_lnl`` on it,
    parses the resulting likelihood file and hands it to ``kmeans.kmeans``.
    Progress is printed to stdout (Python 2 ``print`` statements).

    NOTE(review): the indentation of this method was lost in the source this
    was recovered from.  The PhyML/kmeans calls are shown inside the loop;
    confirm against the original file whether they belong after it.
    """
    partnum = len(self.cfg.user_subsets)
    start_description = range(partnum)
    log.info("Performing subset splitting using kmeans")

    # Create the first scheme
    start_scheme = scheme.create_scheme(self.cfg, "start_scheme", start_description)

    for i in start_scheme:
        # Save the alignment path
        i.make_alignment(self.cfg, self.alignment)
        phylip_file = i.alignment_path
        print phylip_file

        # Add option to output likelihoods, *raxml version takes more
        # modifying of the commands in the analyse function.
        # NOTE(review): the start-tree path below is hard-coded.
        phyml.analyse("GTR", str(phylip_file), "./analysis/start_tree/filtered_source.phy_phyml_tree.txt", "unlinked", "--print_site_lnl")

        # The likelihood file name is derived from the alignment path.
        phyml_lk_file = str(phylip_file) + "_phyml_lk_GTR.txt"
        likelihood_dictionary = kmeans.phyml_likelihood_parser(phyml_lk_file)
        kmeans.kmeans(likelihood_dictionary)

    print start_scheme
def setup(self):
    """Prepare a kmeans analysis and analyse the starting scheme.

    kmeans is refused for everything except morphological data; for
    morphological data a cautionary warning is logged and the analysis
    proceeds with the most partitioned scheme.

    Returns:
        A ``(start_result, start_scheme, tree_path)`` tuple.

    Raises:
        AnalysisError: if the data type is not ``'morphology'``, or if the
            minimum subset size exceeds the number of sites.
    """
    # The messages below use implicit string concatenation.  The previous
    # backslash continuations inside the literals leaked the source
    # indentation into the log output.
    if the_config.datatype != 'morphology':
        log.warning(
            "METHOD DISCONTINUED: "
            "There is increasing evidence that the kmeans "
            "algorithm can lead to poor inferences, so we have "
            "discontinued its use for most data types. "
            "You should instead use other approaches "
            "(e.g. partitioning by locus and codon position). If you have any "
            "questions, please get in touch on the google group. More "
            "information on the empirical issues "
            "can be found in this paper: "
            "http://www.sciencedirect.com/science/article/pii/S1055790316302780.")
        raise AnalysisError

    log.warning(
        "USE CAUTION: "
        "There is increasing evidence that the kmeans "
        "algorithm can lead to poor inferences, so we have "
        "discontinued its use for most data types "
        "(i.e. amino acid and nucleotide data). "
        "More information on the empirical issues "
        "can be found in this paper: "
        "http://www.sciencedirect.com/science/article/pii/S1055790316302780. "
        "We have kept the method available for morphological "
        "data, but warn users that the method is: experimental, "
        "untested on morphological data (either empirical or "
        "simulated), and may give incorrect topologies and branch "
        "lengths (see link to paper above).")

    # set the default subset size to 100 for kmeans analyses
    # (`== False` is kept on purpose: it matches both False and 0)
    if the_config.min_subset_size == False:
        the_config.min_subset_size = 100

    partnum = len(the_config.user_subsets)
    the_config.progress.begin(1, 1)

    # Start with the most partitioned scheme
    start_description = range(partnum)
    start_scheme = scheme.create_scheme(
        the_config, "start_scheme", start_description)

    site_max = sum(len(s.columns) for s in start_scheme.subsets)

    if the_config.min_subset_size > site_max:
        log.error(
            "The minimum subset size must be smaller than the "
            "total number of sites you want to analyse. Your minimum "
            "subset size is %d, and your alignment is %d sites. Please "
            "check and try again.",
            the_config.min_subset_size, site_max)
        raise AnalysisError

    with logtools.indented(
            log,
            "**Analysing starting scheme (scheme %s)**" % start_scheme.name):
        start_result = self.analyse_scheme(start_scheme)

        if not the_config.quick:
            the_config.reporter.write_scheme_summary(start_scheme, start_result)

        tree_path = the_config.processor.make_tree_path(
            self.filtered_alignment_path)

    # The old "kmeans == 'tiger' and datatype != 'morphology'" check that used
    # to sit here was unreachable: non-morphological data already raises at
    # the top of this method.
    return start_result, start_scheme, tree_path
def do_analysis(self):
    """Relaxed clustering (rcluster / rclusterf) search for a scheme.

    Each step measures the similarity of all subset pairs, analyses the
    merges of the most similar pairs that have not been analysed yet, records
    the change in score each merge produces in ``c_matrix``, and then merges
    the best pair (``rcluster``) or every pair at least as good as the median
    improvement (any other search setting, i.e. ``rclusterf``).  The search
    stops when no merge improves the score or a single subset remains.

    If ``min_subset_size`` or ``all_states`` is set, the best scheme is
    passed through ``self.clean_scheme`` and re-analysed before reporting.
    """
    # initialisation steps
    model_selection = the_config.model_selection
    partnum = len(the_config.user_subsets)

    # -987654321 is the sentinel for "rcluster-max not set by the user"
    if the_config.cluster_max == -987654321:
        the_config.cluster_max = max([1000, (10 * len(the_config.user_subsets))])
        log.info("Set rcluster-max to %d" % the_config.cluster_max)

    scheme_count = submodels.count_relaxed_clustering_schemes(
        partnum, the_config.cluster_percent, the_config.cluster_max)
    subset_count = submodels.count_relaxed_clustering_subsets(
        partnum, the_config.cluster_percent, the_config.cluster_max)

    log.info("PartitionFinder will have to analyse %d subsets to"
             " complete this analyses" % subset_count)
    the_config.progress.begin(scheme_count, subset_count)

    # Start with the most partitioned scheme, and record it.
    with logtools.indented(log, "*** Analysing starting scheme ***"):
        the_config.progress.begin(scheme_count, partnum)
        start_scheme = scheme.create_scheme(
            the_config, "start_scheme", range(partnum))
        start_result = self.analyse_scheme(start_scheme)
        start_score = start_result.score
        if not the_config.quick:
            the_config.reporter.write_scheme_summary(
                self.results.best_scheme, self.results.best_result)

    subsets = [s for s in start_scheme.subsets]
    partnum = len(subsets)
    step = 1
    while True:
        with logtools.indented(
                log,
                "*** Relaxed clustering algorithm step %d of up to %d ***"
                % (step, partnum - 1)):

            # Name every scheme built in this step after the step itself.
            # This used to be assigned only when new pairs had to be
            # analysed, so a step with nothing new reused a stale name.
            scheme_name = "step_%d" % (step)

            # get distances between subsets
            max_schemes = comb(len(start_scheme.subsets), 2)
            log.info("Measuring the similarity of %d subset pairs" % max_schemes)
            d_matrix = neighbour.get_distance_matrix(
                subsets, the_config.cluster_weights)

            if step == 1:
                # Now initialise a change in info score matrix to inf
                c_matrix = np.empty(d_matrix.shape)
                c_matrix[:] = np.inf
                c_matrix = spatial.distance.squareform(c_matrix)

            # 1. pick top N subset pairs from distance matrix
            cutoff = int(math.ceil(max_schemes * (the_config.cluster_percent * 0.01)))
            if cutoff <= 0:
                cutoff = 1
            if the_config.cluster_max is not None and cutoff > the_config.cluster_max:
                cutoff = the_config.cluster_max
            log.info("Choosing the %d most similar subset pairs" % cutoff)
            closest_pairs = neighbour.get_N_closest_subsets(
                subsets, the_config, cutoff, d_matrix)

            # 2. analyse K subsets in top N that have not yet been analysed
            pairs_todo = neighbour.get_pairs_todo(closest_pairs, c_matrix, subsets)
            if len(pairs_todo) > 0:
                log.info("Analysing %d new subset pairs" % len(pairs_todo))
                new_subs = []
                sub_tuples = []
                for pair in pairs_todo:
                    new_sub = subset_ops.merge_subsets(pair)
                    new_subs.append(new_sub)
                    sub_tuples.append((new_sub, pair))

                the_config.progress.begin(scheme_count, len(new_subs))
                self.analyse_list_of_subsets(new_subs)

                # 3. for all K new subsets, update improvement matrix and find best pair
                log.info("Finding the best partitioning scheme")
                diffs = []
                for t in sub_tuples:
                    pair_merged = t[0]
                    pair = t[1]
                    new_scheme = neighbour.make_clustered_scheme(
                        start_scheme, scheme_name, pair, pair_merged, the_config)
                    r = self.analyse_scheme(new_scheme)
                    diff = r.score - start_score
                    diffs.append(diff)

                c_matrix = neighbour.update_c_matrix(c_matrix, sub_tuples, subsets, diffs)

            # 4. Find the best pair of subsets, and build a scheme based on that
            # note that this matrix includes diagonals, which will all be zero
            # since this is equivalent to comparing a scheme to itself.
            # so we need to be careful to only proceed if we have a negative change
            # which indicates an improvement in the score
            best_change = np.amin(c_matrix)
            best_scheme = start_scheme

            if best_change >= 0:
                log.info("Found no schemes that improve the score, stopping")
                break

            median_improvement = np.median(c_matrix[c_matrix < 0])

            while best_change <= median_improvement:
                best_pair = neighbour.get_best_pair(c_matrix, best_change, subsets)
                best_merged = subset_ops.merge_subsets(best_pair)
                best_scheme = neighbour.make_clustered_scheme(
                    start_scheme, scheme_name, best_pair, best_merged, the_config)
                start_scheme = best_scheme

                log.info("Combining subsets: '%s' and '%s'" % (best_pair[0].name, best_pair[1].name))
                log.debug("This improves the %s score by: %s", the_config.model_selection, str(abs(best_change)))

                # reset_c_matrix and the subset list
                c_matrix = neighbour.reset_c_matrix(c_matrix, list(best_pair), [best_merged], subsets)

                # we update the subset list in a way that means its structure tracks the c-matrix
                subsets = neighbour.reset_subsets(subsets, list(best_pair), [best_merged])

                best_change = np.amin(c_matrix)

                # with rcluster we just take the single best change;
                # otherwise we are using rclusterf, which continues in this loop
                if the_config.search == 'rcluster':
                    break

            # the best change can get updated a fraction at this point
            # because calculating the info score on the whole alignment
            # is a little different from doing it on the one subset
            best_result = self.analyse_scheme(best_scheme)
            best_change = self.results.best_score - start_score

            log.info("The best scheme has %d subsets and improves the %s score by %.2f to %.1f",
                     len(best_scheme.subsets),
                     the_config.model_selection,
                     np.abs(best_change),
                     self.results.best_score)
            start_scheme = best_scheme
            start_score = best_result.score

            if not the_config.quick:
                the_config.reporter.write_scheme_summary(
                    best_scheme, best_result)

            if len(set(start_scheme.subsets)) == 1:
                break

            step += 1

    log.info("Relaxed clustering algorithm finished after %d steps" % step)
    log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
             % (self.results.best_scheme.name, model_selection, self.results.best_score))

    if the_config.min_subset_size or the_config.all_states:
        best_scheme = self.clean_scheme(self.results.best_scheme)
        best_result = self.analyse_scheme(best_scheme)

        # scores after cleaning can be worse, so we reset these trackers...
        self.results.best_result = best_result
        self.results.best_score = best_result.score
        self.results.best_scheme = best_scheme
        log.info("Best scoring scheme after cleaning is scheme %s, with %s score of %.3f"
                 % (self.results.best_scheme.name, model_selection, self.results.best_score))

    the_config.reporter.write_best_scheme(self.results)
def do_analysis(self):
    '''A greedy algorithm for heuristic partitioning searches.

    Starting from the most partitioned scheme, every step analyses the merge
    of each not-yet-analysed pair of subsets, records the change in score in
    ``c_matrix`` and keeps the single best merge.  The search stops when no
    merge improves the score or when only one subset is left.
    '''
    partnum = len(the_config.user_subsets)
    scheme_count = submodels.count_greedy_schemes(partnum)
    subset_count = submodels.count_greedy_subsets(partnum)

    the_config.progress.begin(scheme_count, subset_count)

    # Start with the most partitioned scheme, and record it.
    with logtools.indented(log, "*** Analysing starting scheme ***"):
        the_config.progress.begin(scheme_count, partnum)
        start_scheme = scheme.create_scheme(
            the_config, "start_scheme", range(partnum))
        start_result = self.analyse_scheme(start_scheme)
        start_score = start_result.score
        if not the_config.quick:
            the_config.reporter.write_scheme_summary(
                self.results.best_scheme, self.results.best_result)

    subsets = [s for s in start_scheme.subsets]

    step = 1
    while len(set(start_scheme.subsets)) > 1:
        with logtools.indented(log, "***Greedy algorithm step %d***" % step):
            # Name every scheme built in this step after the step itself.
            # This used to be assigned only when new pairs had to be
            # analysed, so it could be stale when it was used further down.
            scheme_name = "step_%d" % (step)

            # get distances between subsets
            max_schemes = comb(len(start_scheme.subsets), 2)

            # this is a fake distance matrix, so that the greedy algorithm
            # can use all the tricks of the relaxed clustering algorithm.
            # Floor division keeps the size an int: np.zeros rejects the
            # float that true division yields.
            dim = len(subsets)
            d_matrix = np.zeros(((dim * dim) - dim) // 2)
            d_matrix[:] = np.inf

            if step == 1:
                # Now initialise a change in info score matrix to inf
                c_matrix = np.empty(d_matrix.shape)
                c_matrix[:] = np.inf
                c_matrix = spatial.distance.squareform(c_matrix)

            # 1. pick top N subset pairs from distance matrix
            # this defines the greedy algorithm: we look at all schemes
            cutoff = max_schemes
            closest_pairs = neighbour.get_N_closest_subsets(
                subsets, the_config, cutoff, d_matrix)

            # 2. analyse subsets in top N that have not yet been analysed
            pairs_todo = neighbour.get_pairs_todo(closest_pairs, c_matrix, subsets)
            if len(pairs_todo) > 0:
                log.info("Analysing %d new subset pairs" % len(pairs_todo))
                new_subs = []
                sub_tuples = []
                for pair in pairs_todo:
                    new_sub = subset_ops.merge_subsets(pair)
                    new_subs.append(new_sub)
                    sub_tuples.append((new_sub, pair))

                the_config.progress.begin(scheme_count, len(new_subs))
                self.analyse_list_of_subsets(new_subs)

                # 3. for all K new subsets, update improvement matrix and find best pair
                log.info("Finding the best partitioning scheme")
                diffs = []
                for t in sub_tuples:
                    pair_merged = t[0]
                    pair = t[1]
                    new_scheme = neighbour.make_clustered_scheme(
                        start_scheme, scheme_name, pair, pair_merged, the_config)
                    r = self.analyse_scheme(new_scheme)
                    diff = r.score - start_score
                    diffs.append(diff)

                c_matrix = neighbour.update_c_matrix(c_matrix, sub_tuples, subsets, diffs)

            # 4. Find the best pair of subsets, and build a scheme based on that
            # note that this matrix includes diagonals, which will all be zero
            # since this is equivalent to comparing a scheme to itself.
            # so we need to be careful to only proceed if we have a negative change
            # which indicates an improvement in the score
            best_change = np.amin(c_matrix)

            log.debug("Biggest improvement in info score: %s", str(best_change))

            if best_change >= 0:
                log.info("Found no schemes that improve the score, stopping")
                break

            best_pair = neighbour.get_best_pair(c_matrix, best_change, subsets)
            best_merged = subset_ops.merge_subsets(best_pair)
            best_scheme = neighbour.make_clustered_scheme(
                start_scheme, scheme_name, best_pair, best_merged, the_config)
            best_result = self.analyse_scheme(best_scheme)

            # the best change can get updated a fraction at this point
            # because calculating the info score on the whole alignment
            # is a little different from doing it on the one subset
            best_change = self.results.best_score - start_score

            log.info("Best scheme combines subsets: '%s' and '%s'" % (best_pair[0].name, best_pair[1].name))
            log.info("The best scheme improves the %s score by %.2f to %.1f",
                     the_config.model_selection,
                     np.abs(best_change),
                     self.results.best_score)
            start_scheme = best_scheme
            start_score = best_result.score

            log.debug("Best pair: %s", str([s.name for s in best_pair]))
            log.debug("Merged into: %s", str([best_merged.name]))

            # 5. reset_c_matrix and the subset list
            c_matrix = neighbour.reset_c_matrix(c_matrix, list(best_pair), [best_merged], subsets)

            # we updated the subset list in a special way, which matches how we update the c matrix:
            subsets = neighbour.reset_subsets(subsets, list(best_pair), [best_merged])

            if not the_config.quick:
                the_config.reporter.write_scheme_summary(
                    best_scheme, best_result)

            step += 1

    log.info("Greedy algorithm finished after %d steps" % step)
    log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
             % (self.results.best_scheme.name, the_config.model_selection, self.results.best_score))

    the_config.reporter.write_best_scheme(self.results)
# Group the rows of matrix_out, then print every group together with the set
# of matrix_opr entries belonging to its members.
groups = grouped(matrix_out.tolist())
print('\nGroups:')
for i in groups.items():
    #print('\t', i[0], 'group:', i[1], set([k for j in i[1] for k in matrix_opr[j]]))
    print("\t {0:<1d} group: {1!s:<30s} -> {2!s:<100s}".format(i[0], i[1], set([k for j in i[1] for k in matrix_opr[j]])))

# NOTE: gks3 is deliberately called twice.  According to the original author
# there is a bug in that function and it only works properly on the second
# call; the first result (specify_group1) is not used afterwards.
specify_group1 = gks3(groups, matrix_opr)
refin_groups = gks3(groups, matrix_opr)
print('\nRefined groups:')
for i in refin_groups.items():
    #print('\t', i[0], 'group:', i[1], set([k for j in i[1] for k in matrix_opr[j]]))
    print("\t {0:<1d} group: {1!s:<35s} -> {2!s:<100s}".format(i[0], i[1], set([k for j in i[1] for k in matrix_opr[j]])))

# Build one module per refined group; draw_graph_create_module is given the
# label 'group<N>' for each of them.
module = {}
for i in refin_groups.keys():
    module[i] = draw_graph_create_module(matrix_opr, refin_groups[i], 'group'+str(i))

print('\nModules:')
for i in module.items():
    print('\t', i[0], 'group:')
    for j in i[1].items():
        print('\t', j[0], j[1])
    # NOTE(review): indentation was lost in the source this was recovered
    # from; confirm this separator belongs inside the outer loop.
    print('\n')

un_mod = unique_module(module)
print('\nRefined modules:')
for i in un_mod.items():
    print('\t', i[0], ':', i[1])

create_scheme(matrix_opr, un_mod)

# Wait for the user before the script ends.
input("\nTo exit, press any key")
def do_analysis(self):
    """Iteratively split subsets with kmeans, keeping splits that score better.

    1. Analyse the starting scheme and a scheme in which every starting
       subset has been split by ``kmeans.kmeans_wrapper``; keep the better.
    2. Walk through the subsets of the best scheme, splitting each one with
       ``kmeans.kmeans_split_subset``.  A split is kept when it improves the
       score, and the first resulting subset is then examined next.  Subsets
       flagged as fabricated are set aside.
    3. Merge every fabricated subset into the subset with the nearest
       centroid until none remain, then report the final scheme.
    """
    # Copied and pasted from greedy analysis
    partnum = len(self.cfg.user_subsets)
    scheme_count = submodels.count_greedy_schemes(partnum)
    subset_count = submodels.count_greedy_subsets(partnum)
    self.cfg.progress.begin(scheme_count, subset_count)

    # Start with the most partitioned scheme
    start_description = range(partnum)
    start_scheme = scheme.create_scheme(
        self.cfg, "start_scheme", start_description)

    log.info("Analysing starting scheme (scheme %s)" % start_scheme.name)
    old_score = self.analyse_scheme(start_scheme)

    # Get first scheme
    best_scheme = start_scheme
    subset_index = 0

    processor = self.cfg.processor
    alignment_path = self.filtered_alignment_path
    tree_path = processor.make_tree_path(alignment_path)

    split_subsets = []
    for a_subset in start_scheme:
        how_many = kmeans.kmeans_wrapper(self.cfg, self.alignment,
                                         a_subset, tree_path)
        split_subsets += how_many
    split_scheme = scheme.Scheme(self.cfg, "split_scheme", split_subsets)

    best_result = self.analyse_scheme(best_scheme)
    split_score = self.analyse_scheme(split_scheme)
    if split_score.score < best_result.score:
        best_scheme = split_scheme
        log.info("Initial splits generated superior scheme")

    all_subsets = list(best_scheme.subsets)
    fabricated_subsets = []
    step = 1

    while subset_index < len(all_subsets):
        log.info("Best scheme has %s score of %.2f and %d subset(s)"
                 % (self.cfg.model_selection.upper(), best_result.score,
                    len(best_scheme.subsets)))
        log.info("***Kmeans algorithm step %d***" % step)
        step += 1

        current_subset = all_subsets[subset_index]
        log.info("Analysing subset of %d sites", len(current_subset.columns))

        # First check if the subset is large enough to split, if it isn't,
        # move to the next subset
        if len(current_subset.columns) == 1:
            log.info("This subset cannot be split further")
            subset_index += 1
            continue

        if current_subset.fabricated:
            log.info("This subset cannot be split further because %s cannot analyse it",
                     self.cfg.phylogeny_program)
            subset_index += 1
            fabricated_subsets.append(current_subset)
            continue

        split_subsets = kmeans.kmeans_split_subset(
            self.cfg, self.alignment, current_subset, tree_path)

        # kmeans_split_subset will return a 1 and flag the subset as
        # fabricated if for some reason it raises a PhylogenyProgramError,
        # this is to catch those fabricated subsets
        if split_subsets == 1:
            subset_index += 1
            fabricated_subsets.append(current_subset)
            continue

        for each_subset in split_subsets:
            log.info("Subset resulting from split is %d sites long",
                     len(each_subset.columns))

        # Take a copy
        updated_subsets = all_subsets[:]

        # Replace the current one with the split one
        # Google "slice assignments"
        # This list is the key to avoiding recursion. It expands to contain
        # all of the split subsets by replacing them with the split ones
        updated_subsets[subset_index:subset_index+1] = split_subsets

        test_scheme = scheme.Scheme(self.cfg, str(step-1), updated_subsets)
        new_result = self.analyse_scheme(test_scheme)

        if new_result.score < best_result.score:
            best_scheme = test_scheme
            best_result = new_result

            # Change this to the one with split subsets in it. Note that
            # the subset_index now points a NEW subset, one that was split
            all_subsets = updated_subsets

            # record each scheme that's an improvement
            self.cfg.reporter.write_scheme_summary(
                self.results.best_scheme, self.results.best_result)

            if len(split_subsets) == 2:
                log.info("Splitting subset into %d:%d sites improved the %s score"
                         % (len(split_subsets[0].columns),
                            len(split_subsets[1].columns),
                            self.cfg.model_selection))

                # Log the proportion of columns at each position modulo 3
                for s in split_subsets:
                    m = [x % 3 for x in s.columns]
                    l = float(len(s.columns))
                    props = [(float(m.count(1))/l), (float(m.count(2))/l), (float(m.count(0))/l)]
                    log.info("%d subset has 1st, 2nd, 3rd props: %s" % (len(s.columns), str(props)))
        else:
            log.info("Splitting this subset did not improve the %s score",
                     self.cfg.model_selection.upper())
            # Move to the next subset in the all_subsets list
            subset_index += 1

    log.info("Best scheme has %s score of %.2f and %d subset(s)"
             % (self.cfg.model_selection.upper(), best_result.score,
                len(best_scheme.subsets)))

    if fabricated_subsets:
        log.info("Finalising partitioning scheme")
        log.info("This involves cleaning up small subsets which %s "
                 "can't analyse", self.cfg.phylogeny_program)

    # Now join the fabricated subsets back up with other subsets
    while fabricated_subsets:
        log.info("***Kmeans algorithm step %d***" % step)
        step += 1

        # Take the first subset in the list (to be "popped" off later)
        s = fabricated_subsets[0]
        centroid = s.centroid
        best_match = None

        # Take a list copy of the best scheme
        scheme_list = list(best_scheme)
        scheme_list.remove(s)

        # Loop through the subsets in the best scheme and find the one
        # with the nearest centroid
        for sub in scheme_list:
            centroid_array = [sub.centroid, centroid]
            # euclid_dist = abs(sub.centroid[0] - centroid[0])
            warnings.simplefilter('ignore', DeprecationWarning)
            euclid_dist = spatial.distance.pdist(centroid_array)
            # Test for None first: ordering a value against None raises
            # TypeError on Python 3.
            if best_match is None or euclid_dist < best_match:
                best_match = euclid_dist
                closest_sub = sub

        # Now merge those subsets
        merged_sub = subset_ops.merge_fabricated_subsets([s, closest_sub])

        # Remove the offending subset from the fabricated subset list
        fabricated_subsets.pop(0)

        # If the closest subset happens to be "fabricated" as well, take
        # it out of the fabricated_subsets list
        if closest_sub in fabricated_subsets:
            fabricated_subsets.remove(closest_sub)

        # Get rid of the two subsets that were merged from the best_scheme
        scheme_list.remove(closest_sub)

        # Now add the new subset to the scheme and see if the new subset
        # can be analyzed
        scheme_list.append(merged_sub)
        merged_scheme = scheme.Scheme(self.cfg, str(step-1), scheme_list)
        merged_result = self.analyse_scheme(merged_scheme)

        # If it can be analyzed, move the algorithm forward, if it can't
        # be analyzed add it to the list of fabricated_subsets
        for new_subs in merged_scheme:
            if new_subs.fabricated and new_subs not in fabricated_subsets:
                fabricated_subsets.append(new_subs)

        best_scheme = merged_scheme
        best_result = merged_result

    # Since the AIC will likely be better before we dealt with the
    # fabricated subsets, we need to set the best scheme and best result
    # to those from the last merged_scheme. TODO: add a variable to scheme
    # to take care of this problem so that the best AND analysable scheme
    # is the one that gets automatically flagged as the best scheme
    self.results.best_scheme = best_scheme
    self.results.best_result = best_result

    self.cfg.reporter.write_scheme_summary(
        self.results.best_scheme, self.results.best_result)

    log.info("** Kmeans algorithm finished after %d steps **" % (step - 1))
    log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
             % (self.results.best_scheme.name, self.cfg.model_selection,
                self.results.best_score))

    self.cfg.reporter.write_best_scheme(self.results)
def do_analysis(self):
    """Greediest search: examine only the top-ranked lumpings at each step.

    Every step ranks the candidate lumpings of the current scheme, keeps the
    leading ``greediest_percent`` of them (rounded up) and analyses each one.
    If the overall best score changed, the best scheme becomes the starting
    point of the next step; otherwise the search stops.  It also stops once
    the last analysed scheme has a single subset.
    """
    log.info("Performing greediest analysis")

    cfg = self.cfg
    fraction = cfg.greediest_percent * 0.01
    model_selection = cfg.model_selection
    partnum = len(cfg.partitions)

    cfg.progress.begin(
        submodels.count_greediest_schemes(partnum, cfg.greediest_percent),
        submodels.count_greediest_subsets(partnum, cfg.greediest_percent))

    # Analyse and record the most partitioned scheme first.
    current_scheme = scheme.create_scheme(
        cfg, "start_scheme", range(len(cfg.partitions)))
    log.info("Analysing starting scheme (scheme %s)" % current_scheme.name)
    self.analyse_scheme(current_scheme)
    cfg.reporter.write_scheme_summary(
        self.results.best_scheme, self.results.best_result)

    step = 1
    while True:
        log.info("***Greediest algorithm step %d of %d***" % (step, partnum - 1))
        name_prefix = "step_%d" % (step)

        # All possible lumpings of the current scheme, ordered by the
        # clustering weights, trimmed to the requested share (ceil keeps at
        # least one whenever the list is non-empty).
        candidates = neighbour.get_ranked_clustered_subsets(current_scheme, cfg)
        keep = int(math.ceil(len(candidates) * fraction))
        candidates = candidates[:keep]

        # Analyse every retained lumping.
        score_before = self.results.best_score
        for number, subset_grouping in enumerate(candidates, 1):
            lumped_scheme = neighbour.make_clustered_scheme(
                current_scheme, "%s_%d" % (name_prefix, number),
                subset_grouping, cfg)
            new_result = self.analyse_scheme(lumped_scheme)
            log.info("Difference in %s: %.1f",
                     cfg.model_selection,
                     (new_result.score - score_before))

        if self.results.best_score == score_before:
            log.info("Analysed %.1f percent of the schemes for this step and found no schemes "
                     "that improve the score, stopping",
                     cfg.greediest_percent)
            break

        log.info("Analysed %.1f percent of the schemes for this step. The best "
                 "scheme changed the %s score by %.1f units.",
                 cfg.greediest_percent, cfg.model_selection,
                 (self.results.best_score - score_before))

        # Write out the best scheme and continue the search from it.
        self.results.best_scheme.name = "step_%d" % step
        cfg.reporter.write_scheme_summary(
            self.results.best_scheme, self.results.best_result)
        current_scheme = self.results.best_scheme

        # We're done if it's the scheme with everything together
        if len(set(lumped_scheme.subsets)) == 1:
            break

        step += 1

    log.info("Greediest algorithm finished after %d steps" % step)
    log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
             % (self.results.best_scheme.name, model_selection,
                self.results.best_score))

    txt = "Best scheme according to Greediest algorithm, analysed with %s" % cfg.model_selection
    cfg.reporter.write_best_scheme(txt, self.results)
def make_tree(self, user_path):
    """Write the filtered alignment and make sure a starting tree exists.

    The alignment restricted to the columns of all user subsets is written
    to ``filtered_source.phy`` in the start-tree folder. If
    ``need_new_tree`` reports that a tree is needed, a topology is taken
    from ``user_path`` (when given and non-empty), built by the configured
    processor (``no_ml_tree`` true) or estimated with RAxML (``no_ml_tree``
    false). Branch lengths are then estimated for the first two cases.
    The resulting path is stored on ``self.tree_path``.
    """
    start_dir = the_config.start_tree_path

    # Build an alignment holding ONLY the columns named in the subsets.
    everything = subset_ops.merge_subsets(the_config.user_subsets)
    self.filtered_alignment = SubsetAlignment(self.alignment, everything)
    self.filtered_alignment_path = os.path.join(
        start_dir, 'filtered_source.phy')
    self.filtered_alignment.write(self.filtered_alignment_path)

    # Check the merged subset against the full alignment.
    subset_ops.check_against_alignment(
        everything, self.alignment, the_config)

    # Location used for the copy of the source alignment.
    self.alignment_path = os.path.join(start_dir, 'source.phy')

    # Now look for an existing tree.
    tree_path = the_config.processor.make_tree_path(
        self.filtered_alignment_path)

    if self.need_new_tree(tree_path):
        log.debug("Estimating new starting tree, no old tree found")

        # Clear the folder but keep the alignment files.
        util.clean_out_folder(
            start_dir, keep=["filtered_source.phy", "source.phy"])

        has_user_tree = user_path is not None and user_path != ""
        if has_user_tree:
            # Copy the user topology into the start tree folder.
            log.info("Using user supplied topology at %s" % user_path)
            topology_path = os.path.join(start_dir, 'user_topology.phy')
            util.dupfile(user_path, topology_path)
            estimate_lengths = True
        elif the_config.no_ml_tree == True:
            log.debug("didn't find tree at %s, making a new one"
                      % tree_path)
            topology_path = the_config.processor.make_topology(
                self.filtered_alignment_path,
                the_config.datatype,
                the_config.cmdline_extras)
            estimate_lengths = True
        elif the_config.no_ml_tree == False:
            log.debug(
                "didn't find tree at %s, making an ML tree with RAxML"
                % tree_path)
            subset_ids = range(len(the_config.user_subsets))
            tree_scheme = scheme.create_scheme(
                the_config, "tree_scheme", subset_ids)
            topology_path = raxml.make_ml_topology(
                self.filtered_alignment_path,
                the_config.datatype,
                the_config.cmdline_extras,
                tree_scheme,
                self.threads)

            # Copy the ML topology so it can be used with PhyML too.
            # TODO: this is a hack, and it would be better to decide on a
            # universal name for the different types of tree we might have.
            topology_dir = os.path.dirname(topology_path)
            phyml_tree = os.path.join(
                topology_dir, "filtered_source.phy_phyml_tree.txt")
            copyfile(topology_path, phyml_tree)
            estimate_lengths = False

        if estimate_lengths == True:
            # Now estimate branch lengths on the chosen topology.
            tree_path = the_config.processor.make_branch_lengths(
                self.filtered_alignment_path,
                topology_path,
                the_config.datatype,
                the_config.cmdline_extras)

    self.tree_path = tree_path
    log.debug("Starting tree with branch lengths is here: %s"
              % self.tree_path)
def do_analysis(self):
    '''A greedy algorithm for heuristic partitioning searches.

    Starting from the most partitioned scheme, every step considers all
    pairs of current subsets, analyses the merged subsets that have not
    been analysed yet, records the score change of each merge in a square
    "change" matrix, and adopts the merge with the most negative change.
    The loop ends when no merge improves the score or when the current
    scheme has a single subset. The best scheme is then reported.
    '''
    partnum = len(the_config.user_subsets)
    scheme_count = submodels.count_greedy_schemes(partnum)
    subset_count = submodels.count_greedy_subsets(partnum)

    the_config.progress.begin(scheme_count, subset_count)

    # Start with the most partitioned scheme, and record it.
    with logtools.indented(log, "*** Analysing starting scheme ***"):
        the_config.progress.begin(scheme_count, partnum)
        start_scheme = scheme.create_scheme(
            the_config, "start_scheme", range(partnum))
        start_result = self.analyse_scheme(start_scheme)
        start_score = start_result.score
        if not the_config.quick:
            the_config.reporter.write_scheme_summary(
                self.results.best_scheme, self.results.best_result)

    subsets = list(start_scheme.subsets)

    step = 1
    while len(set(start_scheme.subsets)) > 1:
        with logtools.indented(log, "***Greedy algorithm step %d***" % step):
            # Bound once per step so it is always defined (and current)
            # where the best scheme is built further down.
            scheme_name = "step_%d" % step

            # get distances between subsets
            max_schemes = comb(len(start_scheme.subsets), 2)

            # this is a fake distance matrix, so that the greedy algorithm
            # can use all the tricks of the relaxed clustering algorithm.
            # Its length is the number of unordered pairs; floor division
            # keeps the shape an integer under true division as well.
            dim = len(subsets)
            d_matrix = np.zeros((dim * dim - dim) // 2)
            d_matrix[:] = np.inf

            if step == 1:
                # Now initialise a change in info score matrix to inf
                c_matrix = np.empty(d_matrix.shape)
                c_matrix[:] = np.inf
                c_matrix = spatial.distance.squareform(c_matrix)

            # 1. pick top N subset pairs from distance matrix
            # this defines the greedy algorithm: we look at all schemes
            cutoff = max_schemes
            closest_pairs = neighbour.get_N_closest_subsets(
                subsets, the_config, cutoff, d_matrix)

            # 2. analyse subsets in top N that have not yet been analysed
            pairs_todo = neighbour.get_pairs_todo(
                closest_pairs, c_matrix, subsets)
            if len(pairs_todo) > 0:
                log.info("Analysing %d new subset pairs" % len(pairs_todo))
                new_subs = []
                sub_tuples = []
                for pair in pairs_todo:
                    new_sub = subset_ops.merge_subsets(pair)
                    new_subs.append(new_sub)
                    sub_tuples.append((new_sub, pair))

                the_config.progress.begin(scheme_count, len(new_subs))
                self.analyse_list_of_subsets(new_subs)

                # 3. for all K new subsets, update improvement matrix and
                # find best pair
                log.info("Finding the best partitioning scheme")
                diffs = []
                for pair_merged, pair in sub_tuples:
                    new_scheme = neighbour.make_clustered_scheme(
                        start_scheme, scheme_name, pair, pair_merged,
                        the_config)
                    r = self.analyse_scheme(new_scheme)
                    diffs.append(r.score - start_score)

                c_matrix = neighbour.update_c_matrix(
                    c_matrix, sub_tuples, subsets, diffs)

            # 4. Find the best pair of subsets, and build a scheme based on
            # that. Note that this matrix includes diagonals, which will
            # all be zero since this is equivalent to comparing a scheme to
            # itself, so we need to be careful to only proceed if we have a
            # negative change, which indicates an improvement in the score.
            best_change = np.amin(c_matrix)

            log.debug("Biggest improvement in info score: %s",
                      str(best_change))

            if best_change >= 0:
                log.info(
                    "Found no schemes that improve the score, stopping")
                break

            best_pair = neighbour.get_best_pair(
                c_matrix, best_change, subsets)

            best_merged = subset_ops.merge_subsets(best_pair)
            best_scheme = neighbour.make_clustered_scheme(
                start_scheme, scheme_name, best_pair, best_merged,
                the_config)
            best_result = self.analyse_scheme(best_scheme)

            # the best change can get updated a fraction at this point
            # because calculating the info score on the whole alignment
            # is a little different from doing it on the one subset
            best_change = self.results.best_score - start_score

            log.info("Best scheme combines subsets: '%s' and '%s'"
                     % (best_pair[0].name, best_pair[1].name))

            log.info(
                "The best scheme improves the %s score by %.2f to %.1f",
                the_config.model_selection,
                np.abs(best_change),
                self.results.best_score)
            start_scheme = best_scheme
            start_score = best_result.score

            log.debug("Best pair: %s", str([s.name for s in best_pair]))
            log.debug("Merged into: %s", str([best_merged.name]))

            # 5. reset_c_matrix and the subset list
            c_matrix = neighbour.reset_c_matrix(
                c_matrix, list(best_pair), [best_merged], subsets)

            # we updated the subset list in a special way, which matches
            # how we update the c matrix:
            subsets = neighbour.reset_subsets(
                subsets, list(best_pair), [best_merged])

            if not the_config.quick:
                the_config.reporter.write_scheme_summary(
                    best_scheme, best_result)

            step += 1

    log.info("Greedy algorithm finished after %d steps" % step)
    log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
             % (self.results.best_scheme.name,
                the_config.model_selection,
                self.results.best_score))

    the_config.reporter.write_best_scheme(self.results)
def do_analysis(self):
    """Run the relaxed clustering search over partitioning schemes.

    Each step ranks the possible lumpings of the current scheme by the
    clustering weights, analyses the top ``cluster_percent`` of them, and
    continues from the best scheme recorded in ``self.results`` if the best
    score changed. The search stops when a step leaves the best score
    unchanged or when the analysed scheme has a single subset.
    """
    log.info("Performing relaxed clustering analysis")

    cluster_fraction = self.cfg.cluster_percent * 0.01
    model_selection = self.cfg.model_selection
    partnum = len(self.cfg.partitions)

    scheme_count = submodels.count_relaxed_clustering_schemes(
        partnum, self.cfg.cluster_percent)
    subset_count = submodels.count_relaxed_clustering_subsets(
        partnum, self.cfg.cluster_percent)
    self.cfg.progress.begin(scheme_count, subset_count)

    # Begin with the fully partitioned scheme and record its summary.
    start_description = range(len(self.cfg.partitions))
    start_scheme = scheme.create_scheme(
        self.cfg, "start_scheme", start_description)
    log.info("Analysing starting scheme (scheme %s)" % start_scheme.name)
    self.analyse_scheme(start_scheme)
    self.cfg.reporter.write_scheme_summary(
        self.results.best_scheme, self.results.best_result)

    step = 1
    while True:
        log.info("***Relaxed clustering algorithm step %d of %d***"
                 % (step, partnum - 1))
        name_prefix = "step_%d" % (step)

        # All possible lumpings of the current scheme, ordered according
        # to the clustering weights.
        ranked = neighbour.get_ranked_clustered_subsets(
            start_scheme, self.cfg)

        # Trim to cluster_percent of the list, rounding up to avoid zero.
        keep = int(math.ceil(len(ranked) * cluster_fraction))
        shortlist = ranked[:keep]

        # Analyse the shortlisted lumpings; self.results tracks the best.
        old_best_score = self.results.best_score
        for index, subset_grouping in enumerate(shortlist, 1):
            scheme_name = "%s_%d" % (name_prefix, index)
            lumped_scheme = neighbour.make_clustered_scheme(
                start_scheme, scheme_name, subset_grouping, self.cfg)
            new_result = self.analyse_scheme(lumped_scheme)
            log.debug("Difference in %s: %.1f",
                      self.cfg.model_selection,
                      (new_result.score - old_best_score))

        if self.results.best_score == old_best_score:
            log.info(
                "Analysed %.1f percent of the schemes for this step and found no schemes "
                "that improve the score, stopping",
                self.cfg.cluster_percent)
            break

        log.info(
            "Analysed %.1f percent of the schemes for this step. The best "
            "scheme changed the %s score by %.1f units.",
            self.cfg.cluster_percent, self.cfg.model_selection,
            (self.results.best_score - old_best_score))

        # Write out the best scheme under this step's name.
        self.results.best_scheme.name = "step_%d" % step
        self.cfg.reporter.write_scheme_summary(
            self.results.best_scheme, self.results.best_result)

        # Continue from the best lumping known for this step.
        start_scheme = self.results.best_scheme

        # We're done if it's the scheme with everything together.
        if len(set(lumped_scheme.subsets)) == 1:
            break

        step += 1

    log.info("Relaxed clustering algorithm finished after %d steps" % step)
    log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
             % (self.results.best_scheme.name, model_selection,
                self.results.best_score))

    self.cfg.reporter.write_best_scheme(self.results)
def do_analysis(self):
    '''Greedy heuristic search: keep lumping while the best score changes.

    Every step analyses all lumpings of the current scheme description.
    ``self.results`` tracks the best score; when a step leaves it
    unchanged the search stops, otherwise the description of the new best
    scheme seeds the next step. The best scheme is reported at the end.
    '''
    log.info("Performing greedy analysis")

    partnum = len(self.cfg.partitions)
    scheme_count = submodels.count_greedy_schemes(partnum)
    subset_count = submodels.count_greedy_subsets(partnum)
    self.cfg.progress.begin(scheme_count, subset_count)

    # The most partitioned scheme is the starting point.
    start_description = range(len(self.cfg.partitions))
    start_scheme = scheme.create_scheme(
        self.cfg, "start_scheme", start_description)
    log.info("Analysing starting scheme (scheme %s)" % start_scheme.name)
    self.analyse_scheme(start_scheme)

    step = 1
    next_scheme_id = 2  # running name for each lumped scheme

    # Try all lumpings of the current scheme; keep going while one of
    # them changes the best score.
    while True:
        log.info("***Greedy algorithm step %d***" % step)

        # Remember the best score before this round of lumpings.
        score_before = self.results.best_score

        for lumped_description in algorithm.lumpings(start_description):
            candidate = scheme.create_scheme(
                self.cfg, next_scheme_id, lumped_description)
            next_scheme_id += 1
            # The result is not needed here: self.results remembers the
            # scheme if it is any good, and it is written out later.
            self.analyse_scheme(candidate)

        # The best score only ever gets better (see results.py), so an
        # unchanged value means nothing improved.
        if self.results.best_score == score_before:
            break

        # Continue from the description of the new best scheme.
        start_description = self.results.best_result.scheme.description

        # Rename and record the best scheme for this step.
        self.results.best_scheme.name = "step_%d" % step
        self.cfg.reporter.write_scheme_summary(
            self.results.best_scheme, self.results.best_result)

        # If it's the scheme with everything equal, quit.
        if len(set(start_description)) == 1:
            break

        step += 1

    log.info("Greedy algorithm finished after %d steps" % step)
    log.info("Highest scoring scheme is scheme %s, with %s score of %.3f"
             % (self.results.best_scheme.name,
                self.cfg.model_selection,
                self.results.best_score))

    self.cfg.reporter.write_best_scheme(self.results)
def do_analysis(self):
    """Split subsets with k-means, then run a greedy merge over the result.

    Phase one walks the list of subsets and asks ``kmeans`` to split each
    one; a split is kept when the scheme containing it scores lower than
    the current best scheme. Phase two repeatedly analyses every pairwise
    merge of the current scheme and continues from the best scheme in
    ``self.results`` until the best score stops changing.
    """
    # Copied and pasted from greedy analysis
    partnum = len(self.cfg.user_subsets)
    scheme_count = submodels.count_greedy_schemes(partnum)
    subset_count = submodels.count_greedy_subsets(partnum)
    self.cfg.progress.begin(scheme_count, subset_count)

    # Start with the most partitioned scheme
    start_description = range(partnum)
    start_scheme = scheme.create_scheme(
        self.cfg, "start_scheme", start_description)
    log.info("Analysing starting scheme (scheme %s)" % start_scheme.name)
    self.analyse_scheme(start_scheme)

    # Get first scheme
    best_scheme = start_scheme
    position = 0
    all_subsets = list(best_scheme.subsets)

    tree_path = self.cfg.processor.make_tree_path(
        self.filtered_alignment_path)

    while position < len(all_subsets):
        current_subset = all_subsets[position]
        split_subsets = kmeans.kmeans_split_subset(
            self.cfg, self.alignment, current_subset, tree_path)

        # A return value of 1 means this subset is not split; move on.
        if split_subsets == 1:
            position += 1
            continue

        # Work on a copy in which the current subset is replaced by its
        # pieces (slice assignment). Growing the list in place like this
        # is what avoids recursion: newly split subsets get visited too.
        updated_subsets = all_subsets[:]
        updated_subsets[position:position + 1] = split_subsets

        test_scheme = scheme.Scheme(
            self.cfg, "Current Scheme", updated_subsets)

        try:
            best_result = self.analyse_scheme(best_scheme)
            new_result = self.analyse_scheme(test_scheme)

            log.info("Current best score is: " + str(best_result))
            log.info("Current new score is: " + str(new_result))
            if new_result.score < best_result.score:
                log.info("New score " + str(position) + " is better and will be set to best score")
                best_scheme = test_scheme

                # Adopt the list with the split subsets in it. Note that
                # the index now points at a NEW subset, one that was split.
                all_subsets = updated_subsets
            else:
                # Move to the next subset in the all_subsets list
                position += 1

        # In PhyML or RAxML, it is likely because of no alignment patterns,
        # catch that and move to the next subset without splitting.
        except PhylogenyProgramError:
            log.info("Phylogeny program generated an error so this subset was not split, see error above")
            position += 1

    # Now start the Greedy Analysis: need to figure out how to make it go
    # through more than one scheme...
    start_scheme = best_scheme
    partnum = len(start_scheme.subsets)
    scheme_count = submodels.count_greedy_schemes(partnum)
    subset_count = submodels.count_greedy_subsets(partnum)
    self.cfg.progress.begin(scheme_count, subset_count)

    step = 1
    next_scheme_id = 2

    # Try all pairwise lumpings of the current scheme; keep going while
    # one of them changes the best score.
    while True:
        log.info("***Greedy algorithm step %d***" % step)

        old_best_score = self.results.best_score

        # Every possible pair of subsets in the current scheme.
        for subset_grouping in itertools.combinations(
                start_scheme.subsets, 2):
            lumped_scheme = neighbour.make_clustered_scheme(
                start_scheme, next_scheme_id, subset_grouping, self.cfg)

            new_result = self.analyse_scheme(lumped_scheme)

            log.debug("Difference in %s: %.1f",
                      self.cfg.model_selection,
                      (new_result.score - old_best_score))

            next_scheme_id += 1

        if self.results.best_score == old_best_score:
            log.info("Analysed all schemes for this step and found no schemes "
                     "that improve the score, stopping")
            break

        log.info("Analysed all schemes for this step. The best "
                 "scheme changed the %s score by %.1f units.",
                 self.cfg.model_selection,
                 (self.results.best_score - old_best_score))

        self.results.best_scheme.name = "step_%d" % step
        self.cfg.reporter.write_scheme_summary(
            self.results.best_scheme, self.results.best_result)

        # Continue from the best lumping we know of for this step.
        start_scheme = self.results.best_scheme

        # We're done if it's the scheme with everything together
        if len(set(lumped_scheme.subsets)) == 1:
            break

        step += 1

    log.info("Greedy algorithm finished after %d steps" % step)
    log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
             % (self.results.best_scheme.name,
                self.cfg.model_selection,
                self.results.best_score))

    self.cfg.reporter.write_best_scheme(self.results)
def do_analysis(self):
    """Search for the number of k-means clusters with a bisection search.

    The first subset of the starting scheme is split into ``num_ks``
    clusters using per-site statistics obtained from the processor. While
    splitting improves (lowers) the scheme score, ``num_ks`` is doubled;
    after the first failure it is moved to the midpoint between the last
    improving value and the failing one. The loop stops once ``num_ks``
    no longer exceeds the highest value that improved the score. The best
    scheme recorded in ``self.results`` is then reported.
    """
    # Copied and pasted from greedy analysis
    partnum = len(self.cfg.user_subsets)
    self.cfg.progress.begin(1, 1)

    # Start with the most partitioned scheme
    start_description = range(partnum)
    start_scheme = scheme.create_scheme(
        self.cfg, "start_scheme", start_description)
    log.info("Analysing starting scheme (scheme %s)" % start_scheme.name)
    self.analyse_scheme(start_scheme)

    # Only the first subset of the starting scheme is split below.
    original_subset = list(start_scheme.subsets)[0]

    processor = self.cfg.processor
    alignment_path = self.filtered_alignment_path
    # NOTE(review): the returned path is not used in this method; the call
    # is retained as-is - confirm whether it can be dropped.
    tree_path = processor.make_tree_path(alignment_path)

    best_result = self.analyse_scheme(start_scheme)
    log.info("Starting score is %s" % best_result.score)

    step = 1
    num_ks = 2    # number of clusters to try next
    last_ks = 0   # last cluster count that improved the score
    high_ks = 0   # highest cluster count that improved the score
    cap = False   # becomes True once some cluster count has failed

    # Grab CIs from pre-created file
    per_site_stat_list = processor.get_CIs(self.cfg)
    original_subset.site_lnls_GTRG = per_site_stat_list

    while num_ks > high_ks:
        log.info("*** K-means bisection search step %i ***" % step)
        step += 1

        # Split the alignment into num_ks subsets with k-means, build a
        # scheme from them and see whether that improves the score.
        new_subs = kmeans.kmeans_var_ks(
            self.cfg, original_subset, num_ks, per_site_stat_list)
        new_scheme = scheme.Scheme(self.cfg, str(step - 1), new_subs)
        new_result = self.analyse_scheme(new_scheme)

        # Bisection search: with improvements, set num_ks to double. When
        # the score doesn't improve, set num_ks into the middle between
        # the last one that improved the score and the current number.
        # Floor division keeps num_ks an integer cluster count.
        if new_result.score < best_result.score:
            log.info("Score improved with %s k's" % num_ks)
            log.info("New score is: %d" % new_result.score)
            best_result = new_result
            last_ks = num_ks
            high_ks = num_ks
            if cap:
                num_ks = (new_cap + num_ks) // 2
            else:
                num_ks = num_ks * 2
        else:
            log.info("Didn't get better with %s k's, bisecting..."
                     % num_ks)
            cap = True
            new_cap = num_ks
            num_ks = (last_ks + num_ks) // 2

    log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
             % (self.results.best_scheme.name,
                self.cfg.model_selection,
                self.results.best_score))

    self.cfg.reporter.write_best_scheme(self.results)
def do_analysis(self):
    """Relaxed clustering search driven by a matrix of score changes.

    Each step measures the similarity of all subset pairs, picks the most
    similar ``cluster_percent`` of them (capped by ``cluster_max``),
    analyses the merged subsets that have not been analysed yet and stores
    each merge's score change in a square matrix. The best merge is then
    applied; when ``the_config.search`` is not ``'rcluster'`` further
    merges are applied as long as their change is at least as good as the
    median improvement. The search stops when no merge improves the score
    or a single subset remains. If ``min_subset_size`` or ``all_states``
    is set, the best scheme is cleaned and re-analysed before reporting.
    """
    # initialisation steps
    model_selection = the_config.model_selection
    partnum = len(the_config.user_subsets)

    scheme_count = submodels.count_relaxed_clustering_schemes(
        partnum, the_config.cluster_percent, the_config.cluster_max)
    subset_count = submodels.count_relaxed_clustering_subsets(
        partnum, the_config.cluster_percent, the_config.cluster_max)

    log.info("PartitionFinder will have to analyse %d subsets to"
             " complete this analyses" % subset_count)
    the_config.progress.begin(scheme_count, subset_count)

    # Start with the most partitioned scheme, and record it.
    with logtools.indented(log, "*** Analysing starting scheme ***"):
        the_config.progress.begin(scheme_count, partnum)
        start_scheme = scheme.create_scheme(
            the_config, "start_scheme", range(partnum))
        start_result = self.analyse_scheme(start_scheme)
        start_score = start_result.score
        if not the_config.quick:
            the_config.reporter.write_scheme_summary(
                self.results.best_scheme, self.results.best_result)

    subsets = list(start_scheme.subsets)
    partnum = len(subsets)
    step = 1
    while True:
        with logtools.indented(
                log,
                "*** Relaxed clustering algorithm step %d of up to %d ***"
                % (step, partnum - 1)):

            # Bound on every step: a step in which no new pairs need
            # analysing must not reuse the name of an earlier step.
            scheme_name = "step_%d" % step

            # get distances between subsets
            max_schemes = comb(len(start_scheme.subsets), 2)
            log.info("Measuring the similarity of %d subset pairs"
                     % max_schemes)
            d_matrix = neighbour.get_distance_matrix(
                subsets, the_config.cluster_weights)

            if step == 1:
                # Now initialise a change in info score matrix to inf
                c_matrix = np.empty(d_matrix.shape)
                c_matrix[:] = np.inf
                c_matrix = spatial.distance.squareform(c_matrix)

            # 1. pick top N subset pairs from distance matrix
            cutoff = int(math.ceil(
                max_schemes * (the_config.cluster_percent * 0.01)))
            if cutoff <= 0:
                cutoff = 1
            if (the_config.cluster_max is not None
                    and cutoff > the_config.cluster_max):
                cutoff = the_config.cluster_max
            log.info("Choosing the %d most similar subset pairs" % cutoff)
            closest_pairs = neighbour.get_N_closest_subsets(
                subsets, the_config, cutoff, d_matrix)

            # 2. analyse K subsets in top N that have not yet been analysed
            pairs_todo = neighbour.get_pairs_todo(
                closest_pairs, c_matrix, subsets)
            if len(pairs_todo) > 0:
                log.info("Analysing %d new subset pairs" % len(pairs_todo))
                new_subs = []
                sub_tuples = []
                for pair in pairs_todo:
                    new_sub = subset_ops.merge_subsets(pair)
                    new_subs.append(new_sub)
                    sub_tuples.append((new_sub, pair))

                the_config.progress.begin(scheme_count, len(new_subs))
                self.analyse_list_of_subsets(new_subs)

                # 3. for all K new subsets, update improvement matrix and
                # find best pair
                log.info("Finding the best partitioning scheme")
                diffs = []
                for pair_merged, pair in sub_tuples:
                    new_scheme = neighbour.make_clustered_scheme(
                        start_scheme, scheme_name, pair, pair_merged,
                        the_config)
                    r = self.analyse_scheme(new_scheme)
                    diffs.append(r.score - start_score)

                c_matrix = neighbour.update_c_matrix(
                    c_matrix, sub_tuples, subsets, diffs)

            # 4. Find the best pair of subsets, and build a scheme based on
            # that. Note that this matrix includes diagonals, which will
            # all be zero since this is equivalent to comparing a scheme to
            # itself, so we need to be careful to only proceed if we have a
            # negative change, which indicates an improvement in the score.
            best_change = np.amin(c_matrix)
            best_scheme = start_scheme

            if best_change >= 0:
                log.info(
                    "Found no schemes that improve the score, stopping")
                break

            median_improvement = np.median(c_matrix[c_matrix < 0])

            while best_change <= median_improvement:
                best_pair = neighbour.get_best_pair(
                    c_matrix, best_change, subsets)
                best_merged = subset_ops.merge_subsets(best_pair)
                best_scheme = neighbour.make_clustered_scheme(
                    start_scheme, scheme_name, best_pair, best_merged,
                    the_config)
                start_scheme = best_scheme

                log.info("Combining subsets: '%s' and '%s'"
                         % (best_pair[0].name, best_pair[1].name))
                log.info("This improves the %s score by: %s",
                         the_config.model_selection,
                         str(abs(best_change)))

                # reset_c_matrix and the subset list
                c_matrix = neighbour.reset_c_matrix(
                    c_matrix, list(best_pair), [best_merged], subsets)

                # we update the subset list in a way that means its
                # structure tracks the c-matrix
                subsets = neighbour.reset_subsets(
                    subsets, list(best_pair), [best_merged])

                best_change = np.amin(c_matrix)

                if the_config.search == 'rcluster':
                    # with rcluster we just take the single best change;
                    # otherwise (rclusterf) we continue in this loop
                    break

            # the best change can get updated a fraction at this point
            # because calculating the info score on the whole alignment
            # is a little different from doing it on the one subset
            best_result = self.analyse_scheme(best_scheme)
            best_change = self.results.best_score - start_score

            log.info(
                "The best scheme has %d subsets and improves the %s score by %.2f to %.1f",
                len(best_scheme.subsets),
                the_config.model_selection,
                np.abs(best_change),
                self.results.best_score)
            start_scheme = best_scheme
            start_score = best_result.score

            if not the_config.quick:
                the_config.reporter.write_scheme_summary(
                    best_scheme, best_result)

            if len(set(start_scheme.subsets)) == 1:
                break

            step += 1

    log.info("Relaxed clustering algorithm finished after %d steps" % step)
    log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
             % (self.results.best_scheme.name, model_selection,
                self.results.best_score))

    if the_config.min_subset_size or the_config.all_states:
        best_scheme = self.clean_scheme(self.results.best_scheme)
        best_result = self.analyse_scheme(best_scheme)

        # scores after cleaning can be worse, so we reset these trackers...
        self.results.best_result = best_result
        self.results.best_score = best_result.score
        self.results.best_scheme = best_scheme

        log.info(
            "Best scoring scheme after cleaning is scheme %s, with %s score of %.3f"
            % (self.results.best_scheme.name, model_selection,
               self.results.best_score))

    the_config.reporter.write_best_scheme(self.results)
def setup(self):
    """Validate the configuration for a k-means run and analyse the start scheme.

    Returns:
        A tuple ``(start_result, start_scheme, tree_path)``.

    Raises:
        AnalysisError: if more than one data block is defined, if the
            minimum subset size exceeds the number of sites, or if tiger
            rates were requested but the compiled tiger module cannot be
            imported.
    """
    # set the default subset size to 100 for kmeans analyses
    # NOTE(review): the ``== False`` comparison matches False/0 but not
    # None; kept unchanged - confirm which unset value the config uses.
    if the_config.min_subset_size == False:
        the_config.min_subset_size = 100

    partnum = len(the_config.user_subsets)
    the_config.progress.begin(1, 1)

    # Start with the most partitioned scheme
    start_description = range(partnum)
    start_scheme = scheme.create_scheme(
        the_config, "start_scheme", start_description)

    if len(start_scheme.subsets) > 1:
        log.error(
            "The k-means algorithm is designed to analyse "
            "the entire alignment at once. To use it, please define a "
            "single data block that includes all of your sites, and "
            "try again.")
        raise AnalysisError

    site_max = sum(len(s.columns) for s in start_scheme.subsets)

    if the_config.min_subset_size > site_max:
        log.error(
            "The minimum subset size must be smaller than the "
            "total number of sites you want to analyse. Your minimum "
            "subset size is %d, and your alignment is %d sites. Please "
            "check and try again."
            % (the_config.min_subset_size, site_max))
        raise AnalysisError

    with logtools.indented(
            log,
            "**Analysing starting scheme (scheme %s)**"
            % start_scheme.name):
        start_result = self.analyse_scheme(start_scheme)

        if not the_config.quick:
            the_config.reporter.write_scheme_summary(
                start_scheme, start_result)

        tree_path = the_config.processor.make_tree_path(
            self.filtered_alignment_path)

    if the_config.kmeans == 'tiger':
        try:
            # Optional compiled extension; imported only when requested.
            from _tiger import TigerDNA
            the_config.TigerDNA = TigerDNA
        except ImportError:
            log.error("Couldn't find compiled tiger code.")
            log.error(
                "You have selected kmeans and tiger "
                "rates. This is an unsupported option, if you still wish to use "
                "this option, you must compile the tiger code.")
            log.error(
                "Once you compile the tiger code, this option will work. "
                "But please note that this is an "
                "unsupported option. For empirical work we recommend using "
                "entropy calculations for site rates, which is the default "
                "behaviour for the kmeans algorithm in PF2.")
            raise AnalysisError
    else:
        the_config.TigerDNA = None

    return start_result, start_scheme, tree_path
def setup(self):
    """Warn about k-means, validate the configuration and analyse the start scheme.

    Returns:
        A tuple ``(start_result, start_scheme, tree_path)``.

    Raises:
        AnalysisError: if more than one data block is defined, if the
            minimum subset size exceeds the number of sites, or if tiger
            rates were requested for non-morphology data but the compiled
            tiger module cannot be imported.
    """
    log.warning(
        "Warning as of April 2016: We have noticed that the kmeans "
        "algorithm does not perform well on some simulated datasets. "
        "We are working on investigating and addressing this "
        "but in the mean time we suggest being very cautious about using "
        "this algorithm. At the very least, you should try other approaches "
        "(e.g. partitioning by locus), and investigate your answers carefully "
        "(both the trees and the partitioning schemes). If you have any "
        "questions, please get in touch on the google group.")

    # set the default subset size to 100 for kmeans analyses
    # NOTE(review): the ``== False`` comparison matches False/0 but not
    # None; kept unchanged - confirm which unset value the config uses.
    if the_config.min_subset_size == False:
        the_config.min_subset_size = 100

    partnum = len(the_config.user_subsets)
    the_config.progress.begin(1, 1)

    # Start with the most partitioned scheme
    start_description = range(partnum)
    start_scheme = scheme.create_scheme(
        the_config, "start_scheme", start_description)

    if len(start_scheme.subsets) > 1:
        log.error(
            "The k-means algorithm is designed to analyse "
            "the entire alignment at once. To use it, please define a "
            "single data block that includes all of your sites, and "
            "try again.")
        raise AnalysisError

    site_max = sum(len(s.columns) for s in start_scheme.subsets)

    if the_config.min_subset_size > site_max:
        log.error(
            "The minimum subset size must be smaller than the "
            "total number of sites you want to analyse. Your minimum "
            "subset size is %d, and your alignment is %d sites. Please "
            "check and try again."
            % (the_config.min_subset_size, site_max))
        raise AnalysisError

    with logtools.indented(
            log,
            "**Analysing starting scheme (scheme %s)**"
            % start_scheme.name):
        start_result = self.analyse_scheme(start_scheme)

        if not the_config.quick:
            the_config.reporter.write_scheme_summary(
                start_scheme, start_result)

        tree_path = the_config.processor.make_tree_path(
            self.filtered_alignment_path)

    if the_config.kmeans == 'tiger' and the_config.datatype != 'morphology':
        try:
            # Optional compiled extension; imported only when requested.
            from _tiger import TigerDNA
            the_config.TigerDNA = TigerDNA
        except ImportError:
            log.error("Couldn't find compiled tiger code.")
            log.error(
                "You have selected kmeans and tiger "
                "rates. This is an unsupported option, if you still wish to use "
                "this option, you must compile the tiger code.")
            log.error(
                "Once you compile the tiger code, this option will work. "
                "But please note that this is an "
                "unsupported option. For empirical work we recommend using "
                "entropy calculations for site rates, which is the default "
                "behaviour for the kmeans algorithm in PF2.")
            raise AnalysisError
    else:
        the_config.TigerDNA = None

    return start_result, start_scheme, tree_path
def do_analysis(self):
    """A greedy algorithm for heuristic partitioning searches.

    Starting from the most partitioned scheme, each step analyses every
    lumping of the current description and keeps the one with the lowest
    score under the configured ``model_selection`` (aic, aicc or bic). If
    that score beats the best so far, the search continues from it;
    otherwise it stops. The winning result is stored on
    ``self.best_result``.

    Raises:
        AnalysisError: if ``model_selection`` is not one of the three
            recognised criteria.
    """
    log.info("Performing greedy analysis")

    models = self.cfg.models
    model_selection = self.cfg.model_selection
    partnum = len(self.cfg.partitions)

    self.total_scheme_num = submodels.count_greedy_schemes(partnum)
    log.info("This will result in a maximum of %s schemes being created",
             self.total_scheme_num)

    self.total_subset_num = submodels.count_greedy_subsets(partnum)
    log.info(
        "PartitionFinder will have to analyse a maximum of %d subsets of sites to complete this analysis"
        % (self.total_subset_num))

    if self.total_subset_num > 10000:
        log.warning(
            "%d is a lot of subsets, this might take a long time to analyse",
            self.total_subset_num)
        log.warning(
            "Perhaps consider using a different search scheme instead (see Manual)")

    # clear any schemes that are currently loaded
    # TODO Not sure we need this...
    self.cfg.schemes.clear_schemes()

    # start with the most partitioned scheme
    start_description = range(len(self.cfg.partitions))
    start_scheme = scheme.create_scheme(self.cfg, 1, start_description)
    log.info("Analysing starting scheme (scheme %s)" % start_scheme.name)
    result = self.analyse_scheme(start_scheme, models)

    def get_score(my_result):
        # TODO: this is bad. Should use self.cfg.model_selection, or write
        # a new model_selection for scheme.py
        if model_selection == "aic":
            score = my_result.aic
        elif model_selection == "aicc":
            score = my_result.aicc
        elif model_selection == "bic":
            score = my_result.bic
        else:
            # Report the offending setting; no score exists on this path.
            log.error(
                "Unrecognised model_selection variable '%s', please check"
                % (model_selection))
            raise AnalysisError
        return score

    best_result = result
    best_score = get_score(result)

    step = 1
    cur_s = 2

    # now we try out all lumpings of the current scheme, to see if we can
    # find a better one, and if we do, we just keep going
    while True:
        log.info("***Greedy algorithm step %d***" % step)

        # get a list of all possible lumpings of the best_scheme
        lumpings = algorithm.lumpings(start_description)

        # we reset the counters as we go, for better user information
        self.total_scheme_num = len(lumpings)
        self.schemes_analysed = 0

        best_lumping_score = None
        for lumped_description in lumpings:
            lumped_scheme = scheme.create_scheme(
                self.cfg, cur_s, lumped_description)
            cur_s += 1
            result = self.analyse_scheme(lumped_scheme, models)
            new_score = get_score(result)

            if best_lumping_score is None or new_score < best_lumping_score:
                best_lumping_score = new_score
                best_lumping_result = result
                best_lumping_desc = lumped_description

        # With no lumpings at all there is nothing to compare, so stop
        # instead of touching names that were never bound.
        if best_lumping_score is not None and best_lumping_score < best_score:
            best_score = best_lumping_score
            best_result = best_lumping_result
            start_description = best_lumping_desc
            if len(set(best_lumping_desc)) == 1:
                # then it's the scheme with everything equal, so quit
                break
            step += 1
        else:
            break

    log.info("Greedy algorithm finished after %d steps" % step)
    log.info(
        "Highest scoring scheme is scheme %s, with %s score of %.3f"
        % (best_result.scheme.name, model_selection, best_score))

    self.best_result = best_result
def do_analysis(self):
    """Greedy heuristic search over partitioning schemes.

    Analyses the fully partitioned starting scheme, then repeatedly analyses
    every lumping of the current description. The loop carries on from the
    best scheme recorded in ``self.results`` for as long as the recorded
    best score changes, and ends once a step brings no change or the
    description has collapsed to a single group. The best scheme is written
    out through ``self.cfg.reporter`` at the end.
    """
    log.info("Performing greedy analysis")

    n_partitions = len(self.cfg.partitions)
    self.cfg.progress.begin(
        submodels.count_greedy_schemes(n_partitions),
        submodels.count_greedy_subsets(n_partitions))

    # The most partitioned scheme is the starting point
    description = range(len(self.cfg.partitions))
    initial_scheme = scheme.create_scheme(
        self.cfg, "start_scheme", description)
    log.info("Analysing starting scheme (scheme %s)" % initial_scheme.name)
    self.analyse_scheme(initial_scheme)

    step = 0
    next_scheme_id = 2
    keep_going = True

    while keep_going:
        step += 1
        log.info("***Greedy algorithm step %d***" % step)

        # Every possible lumping of the current description
        candidates = algorithm.lumpings(description)

        # Remember the best score before this round of lumpings
        score_before = self.results.best_score

        for candidate in candidates:
            candidate_scheme = scheme.create_scheme(
                self.cfg, next_scheme_id, candidate)
            next_scheme_id += 1
            # Analysing is enough here; the results object keeps track of
            # whichever scheme turns out to be the best
            self.analyse_scheme(candidate_scheme)

        # An unchanged best score means this round found nothing better
        if self.results.best_score == score_before:
            break

        # Continue from the description of the current best scheme
        description = self.results.best_result.scheme.description

        # Rename and record the best scheme for this step
        self.results.best_scheme.name = "step_%d" % step
        self.cfg.reporter.write_scheme_summary(
            self.results.best_scheme, self.results.best_result)

        # Once everything is in one group there is nothing left to lump
        keep_going = len(set(description)) != 1

    log.info("Greedy algorithm finished after %d steps" % step)
    log.info("Highest scoring scheme is scheme %s, with %s score of %.3f"
             % (self.results.best_scheme.name,
                self.cfg.model_selection,
                self.results.best_score))

    txt = ("Best scheme according to Greedy algorithm, analysed with %s"
           % self.cfg.model_selection)
    self.cfg.reporter.write_best_scheme(txt, self.results)
def setup(self):
    """Validate the k-means settings and analyse the starting scheme.

    Builds the starting scheme from ``the_config.user_subsets``, checks that
    it contains a single subset and that the minimum subset size does not
    exceed the number of sites, analyses it, and resolves the optional
    tiger implementation into ``the_config.TigerDNA``.

    Returns:
        A ``(start_result, start_scheme, tree_path)`` tuple.

    Raises:
        AnalysisError: if more than one subset is defined, if the minimum
            subset size exceeds the number of sites, or if tiger rates were
            requested but the ``_tiger`` module cannot be imported.
    """
    # Set the default subset size to 100 for kmeans analyses.
    # NOTE: ``== False`` is kept on purpose; 0 also compares equal to False.
    if the_config.min_subset_size == False:
        the_config.min_subset_size = 100

    partnum = len(the_config.user_subsets)
    the_config.progress.begin(1, 1)

    # Start with the most partitioned scheme
    start_description = range(partnum)
    start_scheme = scheme.create_scheme(
        the_config, "start_scheme", start_description)

    # Messages use implicit string concatenation so that no source
    # indentation leaks into the logged text.
    if len(start_scheme.subsets) > 1:
        log.error("The k-means algorithm is designed to analyse "
                  "the entire alignment at once. To use it, please define a "
                  "single data block that includes all of your sites, and "
                  "try again.")
        raise AnalysisError

    site_max = sum(len(s.columns) for s in start_scheme.subsets)

    if the_config.min_subset_size > site_max:
        log.error("The minimum subset size must be smaller than the "
                  "total number of sites you want to analyse. Your minimum "
                  "subset size is %d, and your alignment is %d sites. Please "
                  "check and try again.",
                  the_config.min_subset_size, site_max)
        raise AnalysisError

    with logtools.indented(
            log,
            "**Analysing starting scheme (scheme %s)**" % start_scheme.name):
        start_result = self.analyse_scheme(start_scheme)

        if not the_config.quick:
            the_config.reporter.write_scheme_summary(
                start_scheme, start_result)

        tree_path = the_config.processor.make_tree_path(
            self.filtered_alignment_path)

    if the_config.kmeans == 'tiger':
        try:
            from _tiger import TigerDNA
        except Exception:
            # Any failure to load the compiled module is reported the same
            # way, but KeyboardInterrupt/SystemExit are no longer swallowed.
            log.error("Couldn't find compiled tiger code.")
            log.error("You have selected kmeans and tiger "
                      "rates. This is an unsupported option, if you still "
                      "wish to use this option, you must compile the tiger "
                      "code.")
            log.error("Once you compile the tiger code, this option will "
                      "work. But please note that this is an "
                      "unsupported option. For empirical work we recommend "
                      "using entropy calculations for site rates, which is "
                      "the default behaviour for the kmeans algorithm in "
                      "PF2.")
            raise AnalysisError
        the_config.TigerDNA = TigerDNA
    else:
        the_config.TigerDNA = None

    return start_result, start_scheme, tree_path
def make_tree(self, user_path):
    """Write the filtered alignment and make sure a starting tree exists.

    The filtered alignment (only the columns covered by the user subsets) is
    written to ``filtered_source.phy`` in the start-tree folder. If
    ``self.need_new_tree`` reports that a tree is required, the topology
    comes from ``user_path`` when one is given, otherwise from the
    configured processor or from RAxML depending on
    ``the_config.no_ml_tree``. Branch lengths are then estimated for the
    first two cases. The resulting path is stored in ``self.tree_path``.
    """
    # Build an alignment holding ONLY the columns defined in the subsets
    everything = subset_ops.merge_subsets(the_config.user_subsets)
    self.filtered_alignment = SubsetAlignment(self.alignment, everything)
    self.filtered_alignment_path = os.path.join(
        the_config.start_tree_path, 'filtered_source.phy')
    self.filtered_alignment.write(self.filtered_alignment_path)

    # Validate the merged subset against the alignment
    subset_ops.check_against_alignment(
        everything, self.alignment, the_config)

    # Record where the source alignment lives (nothing is written here)
    self.alignment_path = os.path.join(
        the_config.start_tree_path, 'source.phy')

    # Path at which a starting tree is expected
    tree_path = the_config.processor.make_tree_path(
        self.filtered_alignment_path)

    if self.need_new_tree(tree_path):
        log.debug("Estimating new starting tree, no old tree found")

        # Clear the folder, keeping only the two alignment files
        util.clean_out_folder(
            the_config.start_tree_path,
            keep=["filtered_source.phy", "source.phy"])

        if user_path is not None and user_path != "":
            # A user tree was supplied: copy it into the start tree folder
            log.info("Using user supplied topology at %s" % user_path)
            topology = os.path.join(
                the_config.start_tree_path, 'user_topology.phy')
            util.dupfile(user_path, topology)
            estimate_lengths = True
        elif the_config.no_ml_tree == True:
            log.debug(
                "didn't find tree at %s, making a new one" % tree_path)
            topology = the_config.processor.make_topology(
                self.filtered_alignment_path,
                the_config.datatype,
                the_config.cmdline_extras)
            estimate_lengths = True
        elif the_config.no_ml_tree == False:
            log.debug(
                "didn't find tree at %s, making an ML tree with RAxML"
                % tree_path)
            ml_scheme = scheme.create_scheme(
                the_config, "tree_scheme",
                range(len(the_config.user_subsets)))
            topology = raxml.make_ml_topology(
                self.filtered_alignment_path,
                the_config.datatype,
                the_config.cmdline_extras,
                ml_scheme,
                self.threads)

            # Duplicate the ML topology under the PhyML file name so that
            # PhyML can use it as well
            # TODO: this is a hack, and it would be better to decide on a
            # universal name for the different types of tree we might have.
            phyml_copy = os.path.join(
                os.path.dirname(topology),
                "filtered_source.phy_phyml_tree.txt")
            copyfile(topology, phyml_copy)
            estimate_lengths = False

        if estimate_lengths:
            # The topology still needs branch lengths
            tree_path = the_config.processor.make_branch_lengths(
                self.filtered_alignment_path,
                topology,
                the_config.datatype,
                the_config.cmdline_extras)

    self.tree_path = tree_path
    log.debug("Starting tree with branch lengths is here: %s"
              % self.tree_path)
def do_analysis(self):
    """Greedy heuristic search that merges pairs of subsets step by step.

    After analysing the fully partitioned starting scheme, each step builds
    and analyses one clustered scheme for every pair of subsets in the
    current scheme. If the best score recorded in ``self.results`` changed,
    the new best scheme becomes the base for the next step; otherwise the
    search stops. The best scheme is finally written out through
    ``self.cfg.reporter``.
    """
    log.info("Performing greedy analysis")

    partnum = len(self.cfg.user_subsets)
    self.cfg.progress.begin(
        submodels.count_greedy_schemes(partnum),
        submodels.count_greedy_subsets(partnum))

    # The most partitioned scheme is the starting point
    current_scheme = scheme.create_scheme(
        self.cfg, "start_scheme", range(partnum))
    log.info("Analysing starting scheme (scheme %s)" % current_scheme.name)
    self.analyse_scheme(current_scheme)

    step = 0
    next_scheme_id = 2
    keep_going = True

    while keep_going:
        step += 1
        log.info("***Greedy algorithm step %d***" % step)

        score_before = self.results.best_score

        # Try merging every possible pair of subsets of the current scheme
        for pair in itertools.combinations(current_scheme.subsets, 2):
            lumped_scheme = neighbour.make_clustered_scheme(
                current_scheme, next_scheme_id, pair, self.cfg)
            pair_result = self.analyse_scheme(lumped_scheme)

            log.debug("Difference in %s: %.1f",
                      self.cfg.model_selection,
                      (pair_result.score - score_before))
            next_scheme_id += 1

        if self.results.best_score == score_before:
            log.info("Analysed all schemes for this step and found no schemes "
                     "that improve the score, stopping")
            break

        log.info("Analysed all schemes for this step. The best "
                 "scheme changed the %s score by %.1f units.",
                 self.cfg.model_selection,
                 (self.results.best_score - score_before))

        self.results.best_scheme.name = "step_%d" % step
        self.cfg.reporter.write_scheme_summary(
            self.results.best_scheme, self.results.best_result)

        # The best scheme known so far is the base for the next step
        current_scheme = self.results.best_scheme

        # Stop once the last scheme analysed has a single distinct subset
        keep_going = len(set(lumped_scheme.subsets)) != 1

    log.info("Greedy algorithm finished after %d steps" % step)
    log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
             % (self.results.best_scheme.name,
                self.cfg.model_selection,
                self.results.best_score))

    self.cfg.reporter.write_best_scheme(self.results)