Example #1
0
    def do_analysis(self):

        # initialisation steps
        model_selection = the_config.model_selection
        partnum = len(the_config.user_subsets)

        if the_config.cluster_max == -987654321:
            the_config.cluster_max = max([1000, (10 * len(the_config.user_subsets))])
            log.info("Set rcluster-max to %d" %the_config.cluster_max) 

        scheme_count = submodels.count_relaxed_clustering_schemes(
            partnum, the_config.cluster_percent, the_config.cluster_max)
        subset_count = submodels.count_relaxed_clustering_subsets(
            partnum, the_config.cluster_percent, the_config.cluster_max)

        log.info("PartitionFinder will have to analyse %d subsets to"
                 " complete this analyses" % subset_count)
        the_config.progress.begin(scheme_count, subset_count)

        # Start with the most partitioned scheme, and record it.
        with logtools.indented(log, "*** Analysing starting scheme ***"):
            the_config.progress.begin(scheme_count, partnum)
            start_scheme = scheme.create_scheme(
                the_config, "start_scheme", range(partnum))
            start_result = self.analyse_scheme(start_scheme)
            start_score = start_result.score
            if not the_config.quick:
                the_config.reporter.write_scheme_summary(
                    self.results.best_scheme, self.results.best_result)



        subsets = [s for s in start_scheme.subsets]
        partnum = len(subsets)
        step = 1
        while True:
            with logtools.indented(log, "*** Relaxed clustering algorithm step %d of up to %d ***"
                % (step, partnum - 1)):

                # get distances between subsets
                max_schemes = comb(len(start_scheme.subsets), 2)
                log.info("Measuring the similarity of %d subset pairs" % max_schemes)
                d_matrix = neighbour.get_distance_matrix(subsets,
                    the_config.cluster_weights)

                if step == 1:
                    # Now initialise a change in info score matrix to inf
                    c_matrix = np.empty(d_matrix.shape)
                    c_matrix[:] = np.inf
                    c_matrix = spatial.distance.squareform(c_matrix)

                # 1. pick top N subset pairs from distance matrix
                cutoff = int(math.ceil(max_schemes * (the_config.cluster_percent * 0.01)))
                if cutoff <= 0: cutoff = 1
                if the_config.cluster_max != None and cutoff > the_config.cluster_max:
                    cutoff = the_config.cluster_max
                log.info("Choosing the %d most similar subset pairs" % cutoff)
                closest_pairs = neighbour.get_N_closest_subsets(
                    subsets, the_config, cutoff, d_matrix)

                # 2. analyse K subsets in top N that have not yet been analysed
                pairs_todo = neighbour.get_pairs_todo(closest_pairs, c_matrix, subsets)
                if len(pairs_todo)>0:
                    log.info("Analysing %d new subset pairs" % len(pairs_todo))
                    new_subs = []
                    sub_tuples = []
                    for pair in pairs_todo:
                        new_sub = subset_ops.merge_subsets(pair)
                        new_subs.append(new_sub)
                        sub_tuples.append((new_sub, pair))

                    the_config.progress.begin(scheme_count, len(new_subs))
                    self.analyse_list_of_subsets(new_subs)

                    # 3. for all K new subsets, update improvement matrix and find best pair
                    log.info("Finding the best partitioning scheme")
                    diffs = []
                    scheme_name = "step_%d" %(step)
                    for t in sub_tuples:
                        pair_merged = t[0]
                        pair = t[1]
                        new_scheme = neighbour.make_clustered_scheme(
                                start_scheme, scheme_name, pair, pair_merged, the_config)
                        r = self.analyse_scheme(new_scheme)
                        diff = r.score - start_score
                        diffs.append(diff)

                    c_matrix = neighbour.update_c_matrix(c_matrix, sub_tuples, subsets, diffs)

                # 4. Find the best pair of subsets, and build a scheme based on that
                # note that this matrix includes diagonals, which will all be zero
                # since this is equivalent to comparing a scheme to itself.
                # so we need to be careful to only proceed if we have a negative change
                # which indicates an improvement in the score
                best_change = np.amin(c_matrix)
                best_scheme = start_scheme

                if best_change>=0:
                    log.info("Found no schemes that improve the score, stopping")
                    break

                median_improvement = np.median(c_matrix[c_matrix<0])

                while best_change <= median_improvement:

                    best_pair = neighbour.get_best_pair(c_matrix, best_change, subsets)
                    best_merged = subset_ops.merge_subsets(best_pair)
                    best_scheme = neighbour.make_clustered_scheme(
                        start_scheme, scheme_name, best_pair, best_merged, the_config)
                    start_scheme = best_scheme

                    log.info("Combining subsets: '%s' and '%s'" %(best_pair[0].name, best_pair[1].name))
                    log.debug("This improves the %s score by: %s", the_config.model_selection, str(abs(best_change)))

                    # reset_c_matrix and the subset list
                    c_matrix = neighbour.reset_c_matrix(c_matrix, list(best_pair), [best_merged], subsets)

                    # we update the subset list in a way that means its structure tracks the c-matrix
                    subsets = neighbour.reset_subsets(subsets, list(best_pair), [best_merged])

                    best_change = np.amin(c_matrix)

                    if the_config.search == 'rcluster':
                        break
                        # otherwise we are using rclusterf, which continues in this loop
                        # i.e. with rcluster we just take the single best change


                # the best change can get updated a fraction at this point
                # because calaculting the info score on the whole alignment
                # is a little different from doing it on the one subset
                best_result = self.analyse_scheme(best_scheme)
                best_change = self.results.best_score - start_score


                log.info("The best scheme has %d subsets and improves the %s score by %.2f to %.1f",
                    len(best_scheme.subsets),
                    the_config.model_selection,
                    np.abs(best_change),
                    self.results.best_score)
                start_scheme = best_scheme
                start_score = best_result.score


                if not the_config.quick:
                    the_config.reporter.write_scheme_summary(
                        best_scheme, best_result)


                if len(set(start_scheme.subsets)) == 1:
                    break

                step += 1

        log.info("Relaxed clustering algorithm finished after %d steps" % step)
        log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
                 % (self.results.best_scheme.name, model_selection,
                    self.results.best_score))

        if the_config.min_subset_size or the_config.all_states:
            best_scheme = self.clean_scheme(self.results.best_scheme)
            best_result = self.analyse_scheme(best_scheme)

            # scores after cleaning can be worse, so we reset these trackers...
            self.results.best_result = best_result
            self.results.best_score = best_result.score
            self.results.best_scheme = best_scheme
            log.info("Best scoring scheme after cleaning is scheme %s, with %s score of %.3f"
                     % (self.results.best_scheme.name, model_selection,
                        self.results.best_score))


        the_config.reporter.write_best_scheme(self.results)
Example #2
0
    def do_analysis(self):
        '''A greedy algorithm for heuristic partitioning searches'''

        partnum = len(the_config.user_subsets)
        scheme_count = submodels.count_greedy_schemes(partnum)
        subset_count = submodels.count_greedy_subsets(partnum)

        the_config.progress.begin(scheme_count, subset_count)

        # Start with the most partitioned scheme, and record it.
        with logtools.indented(log, "*** Analysing starting scheme ***"):
            the_config.progress.begin(scheme_count, partnum)
            start_scheme = scheme.create_scheme(
                the_config, "start_scheme", range(partnum))
            start_result = self.analyse_scheme(start_scheme)
            start_score = start_result.score
            if not the_config.quick:
                the_config.reporter.write_scheme_summary(
                    self.results.best_scheme, self.results.best_result)

        subsets = [s for s in start_scheme.subsets]

        step = 1
        while len(set(start_scheme.subsets)) > 1:
            with logtools.indented(log, "***Greedy algorithm step %d***" % step):
                name_prefix = "step_%d" % (step)

                # get distances between subsets
                max_schemes = comb(len(start_scheme.subsets), 2)

                # this is a fake distance matrix, so that the greedy algorithm
                # can use all the tricks of the relaxed clustering algorithm
                dim = len(subsets)
                d_matrix = np.zeros((((dim*dim)-dim))/2)
                d_matrix[:] = np.inf

                if step == 1:
                    # Now initialise a change in info score matrix to inf
                    c_matrix = np.empty(d_matrix.shape)
                    c_matrix[:] = np.inf
                    c_matrix = spatial.distance.squareform(c_matrix)

                # 1. pick top N subset pairs from distance matrix
                cutoff = max_schemes # this defines the greedy algorithm: we look at all schemes

                closest_pairs = neighbour.get_N_closest_subsets(
                    subsets, the_config, cutoff, d_matrix)

                # 2. analyse subsets in top N that have not yet been analysed
                pairs_todo = neighbour.get_pairs_todo(closest_pairs, c_matrix, subsets)
                if len(pairs_todo)>0:
                    log.info("Analysing %d new subset pairs" % len(pairs_todo))
                    new_subs = []
                    sub_tuples = []
                    for pair in pairs_todo:
                        new_sub = subset_ops.merge_subsets(pair)
                        new_subs.append(new_sub)
                        sub_tuples.append((new_sub, pair))

                    the_config.progress.begin(scheme_count, len(new_subs))
                    self.analyse_list_of_subsets(new_subs)

                    # 3. for all K new subsets, update improvement matrix and find best pair
                    log.info("Finding the best partitioning scheme")
                    diffs = []
                    scheme_name = "step_%d" %(step)
                    for t in sub_tuples:
                        pair_merged = t[0]
                        pair = t[1]
                        new_scheme = neighbour.make_clustered_scheme(
                                start_scheme, scheme_name, pair, pair_merged, the_config)
                        r = self.analyse_scheme(new_scheme)
                        diff = r.score - start_score
                        diffs.append(diff)

                    c_matrix = neighbour.update_c_matrix(c_matrix, sub_tuples, subsets, diffs)


                # 4. Find the best pair of subsets, and build a scheme based on that
                # note that this matrix includes diagonals, which will all be zero
                # since this is equivalent to comparing a scheme to itself.
                # so we need to be careful to only proceed if we have a negative change
                # which indicates an improvement in the score
                best_change = np.amin(c_matrix)

                log.debug("Biggest improvement in info score: %s", str(best_change))

                if best_change>=0:
                    log.info("Found no schemes that improve the score, stopping")
                    break

                best_pair = neighbour.get_best_pair(c_matrix, best_change, subsets)

                best_merged = subset_ops.merge_subsets(best_pair)
                best_scheme = neighbour.make_clustered_scheme(
                    start_scheme, scheme_name, best_pair, best_merged, the_config)
                best_result = self.analyse_scheme(best_scheme)

                # the best change can get updated a fraction at this point
                # because calaculting the info score on the whole alignment
                # is a little different from doing it on the one subset
                best_change = self.results.best_score - start_score


                log.info("Best scheme combines subsets: '%s' and '%s'" %(best_pair[0].name, best_pair[1].name))


                log.info("The best scheme improves the %s score by %.2f to %.1f",
                    the_config.model_selection,
                    np.abs(best_change),
                    self.results.best_score)
                start_scheme = best_scheme
                start_score = best_result.score

                log.debug("Best pair: %s", str([s.name for s in best_pair]))
                log.debug("Merged into: %s", str([best_merged.name]))

                # 5. reset_c_matrix and the subset list
                c_matrix = neighbour.reset_c_matrix(c_matrix, list(best_pair), [best_merged], subsets)

                # we updated the subset list in a special way, which matches how we update the c matrix:
                subsets = neighbour.reset_subsets(subsets, list(best_pair), [best_merged])

                if not the_config.quick:
                    the_config.reporter.write_scheme_summary(
                        best_scheme, best_result)

                step += 1

        log.info("Greedy algorithm finished after %d steps" % step)
        log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
                 % (self.results.best_scheme.name, the_config.model_selection,
                    self.results.best_score))

        the_config.reporter.write_best_scheme(self.results)