Example #1
    def clean_scheme(self, start_scheme):
        # Here we look for and fix up subsets that are too small or don't have all states
        keep_going = 1
        merges = 0
        if keep_going == 1:
            with logtools.indented(
                    log,
                    "*** Checking subsets from scheme '%s' meet --min-subset-size and --all_states settings ***"
                    % start_scheme.name):
                while keep_going > 0:

                    subsets = [s for s in start_scheme.subsets]

                    # sort the subsets, to keep results consistent over re-runs
                    subsets.sort(key=lambda x: 1.0 / float(len(x.columns)))

                    # run through all subsets
                    found = 0
                    for sub in subsets:
                        state_problems = self.alignment.check_state_probs(
                            sub, the_config)

                        if (len(sub.columns) < the_config.min_subset_size
                                or state_problems):

                            # merge that subset with nearest neighbour
                            new_pair = neighbour.get_closest_subset(
                                sub, subsets, the_config)

                            log.info(
                                "Subset '%s' will be merged with subset '%s'" %
                                (new_pair[0].name, new_pair[1].name))
                            new_pair_merged = subset_ops.merge_subsets(
                                new_pair)
                            start_scheme = neighbour.make_clustered_scheme(
                                start_scheme, "cleaned_scheme", new_pair,
                                new_pair_merged, the_config)
                            the_config.progress.begin(1, 1)
                            self.analyse_scheme(start_scheme)
                            subsets = [s for s in start_scheme.subsets]
                            merges += 1
                            found = 1
                            break

                    # if we got to here, there were no subsets to merge
                    if found == 0:
                        keep_going = 0

                    if len(subsets) == 1:
                        log.error(
                            "The settings you have used for --all-states and/or --min-subset-size mean that all of your subsets have been merged into one prior to any analysis. Thus, no analysis is necessary. Please check and try again"
                        )
                        raise AnalysisError

                log.info(
                    "%d subsets merged because of --min-subset-size and/or --all-states settings"
                    % merges)
        return start_scheme
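
A note on the sort key: taking the reciprocal of the subset length orders subsets from largest to smallest, which is what keeps re-runs deterministic. A minimal sketch of that ordering (the Fake class is a stand-in for real subset objects, invented purely for illustration):

class Fake(object):
    # stand-in for a real subset object with a .columns attribute
    def __init__(self, n):
        self.columns = list(range(n))

subs = [Fake(3), Fake(10), Fake(7)]
subs.sort(key=lambda x: 1.0 / float(len(x.columns)))
print([len(s.columns) for s in subs])  # [10, 7, 3] -- largest first

# equivalent, and arguably clearer:
subs.sort(key=lambda x: len(x.columns), reverse=True)
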
Example #2
    def clean_scheme(self, start_scheme):
        # Here we look for and fix up subsets that are too small or don't have all states
        keep_going = 1
        merges = 0
        if keep_going == 1:
            with logtools.indented(
                    log,
                    "*** Checking subsets from scheme '%s' meet --min-subset-size and --all_states settings ***"
                    % start_scheme.name):
                while keep_going > 0:

                    subsets = [s for s in start_scheme.subsets]

                    # sort the subsets, to keep results consistent over re-runs
                    subsets.sort(key=lambda x: 1.0 / float(len(x.columns)))

                    # run through all subsets
                    found = 0
                    for sub in subsets:
                        state_problems = self.alignment.check_state_probs(
                            sub, the_config)

                        if (len(sub.columns) < the_config.min_subset_size
                                or state_problems):

                            # merge that subset with nearest neighbour
                            new_pair = neighbour.get_closest_subset(sub, subsets, the_config)

                            log.info("Subset '%s' will be merged with subset '%s'" %(new_pair[0].name, new_pair[1].name))
                            new_pair_merged = subset_ops.merge_subsets(new_pair)
                            start_scheme = neighbour.make_clustered_scheme(
                                    start_scheme, "cleaned_scheme", new_pair, new_pair_merged, the_config)
                            the_config.progress.begin(1, 1)
                            self.analyse_scheme(start_scheme)
                            subsets = [s for s in start_scheme.subsets]
                            merges += 1
                            found = 1
                            break

                    # if we got to here, there were no subsets to merge
                    if found == 0:
                        keep_going = 0

                    if len(subsets) == 1:
                        log.error("The settings you have used for --all-states and/or --min-subset-size mean that all of your subsets have been merged into one prior to any analysis. Thus, no analysis is necessary. Please check and try again")
                        raise AnalysisError

                log.info("%d subsets merged because of --min-subset-size and/or --all-states settings" % merges)
        return start_scheme

    def do_analysis(self):
        log.info("Performing greediest analysis")

        stop_at = self.cfg.greediest_percent * 0.01

        model_selection = self.cfg.model_selection
        partnum = len(self.cfg.partitions)

        scheme_count = submodels.count_greediest_schemes(partnum, self.cfg.greediest_percent)
        subset_count = submodels.count_greediest_subsets(partnum, self.cfg.greediest_percent)

        self.cfg.progress.begin(scheme_count, subset_count)

        # Start with the most partitioned scheme, and record it.
        start_description = range(len(self.cfg.partitions))
        start_scheme = scheme.create_scheme(
            self.cfg, "start_scheme", start_description)
        log.info("Analysing starting scheme (scheme %s)" % start_scheme.name)
        self.analyse_scheme(start_scheme)
        self.cfg.reporter.write_scheme_summary(
            self.results.best_scheme, self.results.best_result)


        # Start by remembering that we analysed the starting scheme
        subset_counter = 1
        step = 1
        while True:

            log.info("***Greediest algorithm step %d of %d***" % (step, partnum - 1))
            name_prefix = "step_%d" % (step)

            # Get a list of all possible lumpings of the best_scheme, ordered
            # according to the clustering weights
            lumped_subsets = neighbour.get_ranked_clustered_subsets(
                start_scheme, self.cfg)

            # reduce the size of the lumped subsets to greediest_percent long
            # round up so the cutoff is never zero
            cutoff = int(math.ceil(len(lumped_subsets) * stop_at))
            lumped_subsets = lumped_subsets[:cutoff]

            # Now analyse the lumped schemes
            lumpings_done = 0
            old_best_score = self.results.best_score

            for subset_grouping in lumped_subsets:
                scheme_name = "%s_%d" % (name_prefix, lumpings_done + 1)
                lumped_scheme = neighbour.make_clustered_scheme(
                    start_scheme, scheme_name, subset_grouping, self.cfg)

                new_result = self.analyse_scheme(lumped_scheme)

                log.info("Difference in %s: %.1f", self.cfg.model_selection,
                         (new_result.score - old_best_score))

                lumpings_done += 1

            if self.results.best_score != old_best_score:
                log.info("Analysed %.1f percent of the schemes for this step. The best "
                         "scheme changed the %s score by %.1f units.", 
                         self.cfg.greediest_percent, self.cfg.model_selection,
                         (self.results.best_score - old_best_score))

                # write out the best scheme
                self.results.best_scheme.name = "step_%d" % step
                self.cfg.reporter.write_scheme_summary(
                    self.results.best_scheme, self.results.best_result)

                # Now we find out which is the best lumping we know of for this step
                start_scheme = self.results.best_scheme
            else:
                log.info("Analysed %.1f percent of the schemes for this step and found no schemes "
                         "that improve the score, stopping" , self.cfg.greediest_percent)
                break

            # We're done if it's the scheme with everything together
            if len(set(lumped_scheme.subsets)) == 1:
                break

            step += 1

        log.info("Greediest algorithm finished after %d steps" % step)
        log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
                 % (self.results.best_scheme.name, model_selection, self.results.best_score))

        txt = "Best scheme according to Greediest algorithm, analysed with %s" % self.cfg.model_selection
        self.cfg.reporter.write_best_scheme(txt, self.results)
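
A quick sketch of the cutoff arithmetic in the greediest step above: math.ceil rounds the fraction up, so the cutoff is never zero as long as there is at least one candidate lumping. Both numbers below are invented for illustration:

import math

greediest_percent = 20                      # hypothetical --greediest-percent
stop_at = greediest_percent * 0.01          # 0.2
candidate_lumpings = 37                     # hypothetical number of lumpings
cutoff = int(math.ceil(candidate_lumpings * stop_at))
print(cutoff)  # ceil(7.4) -> 8 schemes analysed this step
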
Example #4
    def do_analysis(self):

        # initialisation steps
        model_selection = the_config.model_selection
        partnum = len(the_config.user_subsets)

        scheme_count = submodels.count_relaxed_clustering_schemes(
            partnum, the_config.cluster_percent, the_config.cluster_max)
        subset_count = submodels.count_relaxed_clustering_subsets(
            partnum, the_config.cluster_percent, the_config.cluster_max)

        log.info("PartitionFinder will have to analyse %d subsets to"
                 " complete this analyses" % subset_count)
        the_config.progress.begin(scheme_count, subset_count)

        # Start with the most partitioned scheme, and record it.
        with logtools.indented(log, "*** Analysing starting scheme ***"):
            the_config.progress.begin(scheme_count, partnum)
            start_scheme = scheme.create_scheme(the_config, "start_scheme",
                                                range(partnum))
            start_result = self.analyse_scheme(start_scheme)
            start_score = start_result.score
            if not the_config.quick:
                the_config.reporter.write_scheme_summary(
                    self.results.best_scheme, self.results.best_result)

        subsets = [s for s in start_scheme.subsets]
        partnum = len(subsets)
        step = 1
        while True:
            with logtools.indented(
                    log,
                    "*** Relaxed clustering algorithm step %d of up to %d ***"
                    % (step, partnum - 1)):

                # get distances between subsets
                max_schemes = comb(len(start_scheme.subsets), 2)
                log.info("Measuring the similarity of %d subset pairs" %
                         max_schemes)
                d_matrix = neighbour.get_distance_matrix(
                    subsets, the_config.cluster_weights)

                if step == 1:
                    # Now initialise a change in info score matrix to inf
                    c_matrix = np.empty(d_matrix.shape)
                    c_matrix[:] = np.inf
                    c_matrix = spatial.distance.squareform(c_matrix)

                # 1. pick top N subset pairs from distance matrix
                cutoff = int(
                    math.ceil(max_schemes *
                              (the_config.cluster_percent * 0.01)))
                if cutoff <= 0:
                    cutoff = 1
                if the_config.cluster_max is not None and cutoff > the_config.cluster_max:
                    cutoff = the_config.cluster_max
                log.info("Choosing the %d most similar subset pairs" % cutoff)
                closest_pairs = neighbour.get_N_closest_subsets(
                    subsets, the_config, cutoff, d_matrix)

                # 2. analyse K subsets in top N that have not yet been analysed
                pairs_todo = neighbour.get_pairs_todo(closest_pairs, c_matrix,
                                                      subsets)
                if len(pairs_todo) > 0:
                    log.info("Analysing %d new subset pairs" % len(pairs_todo))
                    new_subs = []
                    sub_tuples = []
                    for pair in pairs_todo:
                        new_sub = subset_ops.merge_subsets(pair)
                        new_subs.append(new_sub)
                        sub_tuples.append((new_sub, pair))

                    the_config.progress.begin(scheme_count, len(new_subs))
                    self.analyse_list_of_subsets(new_subs)

                    # 3. for all K new subsets, update improvement matrix and find best pair
                    log.info("Finding the best partitioning scheme")
                    diffs = []
                    scheme_name = "step_%d" % (step)
                    for t in sub_tuples:
                        pair_merged = t[0]
                        pair = t[1]
                        new_scheme = neighbour.make_clustered_scheme(
                            start_scheme, scheme_name, pair, pair_merged,
                            the_config)
                        r = self.analyse_scheme(new_scheme)
                        diff = r.score - start_score
                        diffs.append(diff)

                    c_matrix = neighbour.update_c_matrix(
                        c_matrix, sub_tuples, subsets, diffs)

                # 4. Find the best pair of subsets, and build a scheme based on that
                # note that this matrix includes diagonals, which will all be zero
                # since this is equivalent to comparing a scheme to itself.
                # so we need to be careful to only proceed if we have a negative change
                # which indicates an improvement in the score
                best_change = np.amin(c_matrix)
                best_scheme = start_scheme

                if best_change >= 0:
                    log.info(
                        "Found no schemes that improve the score, stopping")
                    break

                median_improvement = np.median(c_matrix[c_matrix < 0])

                while best_change <= median_improvement:

                    best_pair = neighbour.get_best_pair(
                        c_matrix, best_change, subsets)
                    best_merged = subset_ops.merge_subsets(best_pair)
                    best_scheme = neighbour.make_clustered_scheme(
                        start_scheme, scheme_name, best_pair, best_merged,
                        the_config)
                    start_scheme = best_scheme

                    log.info("Combining subsets: '%s' and '%s'" %
                             (best_pair[0].name, best_pair[1].name))
                    log.info("This improves the %s score by: %s",
                             the_config.model_selection, str(abs(best_change)))

                    # reset_c_matrix and the subset list
                    c_matrix = neighbour.reset_c_matrix(
                        c_matrix, list(best_pair), [best_merged], subsets)

                    # we update the subset list in a way that means its structure tracks the c-matrix
                    subsets = neighbour.reset_subsets(subsets, list(best_pair),
                                                      [best_merged])

                    best_change = np.amin(c_matrix)

                    # with rcluster we take only the single best change;
                    # rclusterf stays in this loop and keeps merging
                    if the_config.search == 'rcluster':
                        break

                # the best change can shift slightly at this point, because
                # calculating the info score on the whole alignment differs a
                # little from calculating it on the single subset
                best_result = self.analyse_scheme(best_scheme)
                best_change = self.results.best_score - start_score

                log.info(
                    "The best scheme has %d subsets and improves the %s score by %.2f to %.1f",
                    len(best_scheme.subsets), the_config.model_selection,
                    np.abs(best_change), self.results.best_score)
                start_scheme = best_scheme
                start_score = best_result.score

                if not the_config.quick:
                    the_config.reporter.write_scheme_summary(
                        best_scheme, best_result)

                if len(set(start_scheme.subsets)) == 1:
                    break

                step += 1

        log.info("Relaxed clustering algorithm finished after %d steps" % step)
        log.info("Best scoring scheme is scheme %s, with %s score of %.3f" %
                 (self.results.best_scheme.name, model_selection,
                  self.results.best_score))

        if the_config.min_subset_size or the_config.all_states:
            best_scheme = self.clean_scheme(self.results.best_scheme)
            best_result = self.analyse_scheme(best_scheme)

            # scores after cleaning can be worse, so we reset these trackers...
            self.results.best_result = best_result
            self.results.best_score = best_result.score
            self.results.best_scheme = best_scheme
            log.info(
                "Best scoring scheme after cleaning is scheme %s, with %s score of %.3f"
                % (self.results.best_scheme.name, model_selection,
                   self.results.best_score))

        the_config.reporter.write_best_scheme(self.results)
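
The c_matrix bookkeeping above leans on SciPy's squareform, which expands a condensed vector of length n*(n-1)/2 into an n x n symmetric matrix with a zero diagonal; that zero diagonal is exactly why the code only treats negative entries as genuine improvements. A self-contained sketch with an invented subset count:

import numpy as np
from scipy import spatial
from scipy.special import comb

n = 4                                    # pretend we have 4 subsets
condensed = np.empty(int(comb(n, 2)))    # length 6 == n*(n-1)/2
condensed[:] = np.inf                    # no pair has been scored yet
c_matrix = spatial.distance.squareform(condensed)
print(c_matrix.shape)       # (4, 4)
print(c_matrix.diagonal())  # [0. 0. 0. 0.] -- the zero diagonal
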
Example #5
    def do_analysis(self):
        '''A greedy algorithm for heuristic partitioning searches'''

        partnum = len(the_config.user_subsets)
        scheme_count = submodels.count_greedy_schemes(partnum)
        subset_count = submodels.count_greedy_subsets(partnum)

        the_config.progress.begin(scheme_count, subset_count)

        # Start with the most partitioned scheme, and record it.
        with logtools.indented(log, "*** Analysing starting scheme ***"):
            the_config.progress.begin(scheme_count, partnum)
            start_scheme = scheme.create_scheme(the_config, "start_scheme",
                                                range(partnum))
            start_result = self.analyse_scheme(start_scheme)
            start_score = start_result.score
            if not the_config.quick:
                the_config.reporter.write_scheme_summary(
                    self.results.best_scheme, self.results.best_result)

        subsets = [s for s in start_scheme.subsets]

        step = 1
        while len(set(start_scheme.subsets)) > 1:
            with logtools.indented(log,
                                   "***Greedy algorithm step %d***" % step):
                name_prefix = "step_%d" % (step)

                # get distances between subsets
                max_schemes = comb(len(start_scheme.subsets), 2)

                # this is a fake distance matrix, so that the greedy algorithm
                # can use all the tricks of the relaxed clustering algorithm
                dim = len(subsets)
                # integer division: np.zeros() needs an int size in Python 3
                d_matrix = np.zeros((dim * dim - dim) // 2)
                d_matrix[:] = np.inf

                if step == 1:
                    # Now initialise a change in info score matrix to inf
                    c_matrix = np.empty(d_matrix.shape)
                    c_matrix[:] = np.inf
                    c_matrix = spatial.distance.squareform(c_matrix)

                # 1. pick top N subset pairs from distance matrix
                cutoff = max_schemes  # this defines the greedy algorithm: we look at all schemes

                closest_pairs = neighbour.get_N_closest_subsets(
                    subsets, the_config, cutoff, d_matrix)

                # 2. analyse subsets in top N that have not yet been analysed
                pairs_todo = neighbour.get_pairs_todo(closest_pairs, c_matrix,
                                                      subsets)
                if len(pairs_todo) > 0:
                    log.info("Analysing %d new subset pairs" % len(pairs_todo))
                    new_subs = []
                    sub_tuples = []
                    for pair in pairs_todo:
                        new_sub = subset_ops.merge_subsets(pair)
                        new_subs.append(new_sub)
                        sub_tuples.append((new_sub, pair))

                    the_config.progress.begin(scheme_count, len(new_subs))
                    self.analyse_list_of_subsets(new_subs)

                    # 3. for all K new subsets, update improvement matrix and find best pair
                    log.info("Finding the best partitioning scheme")
                    diffs = []
                    scheme_name = "step_%d" % (step)
                    for t in sub_tuples:
                        pair_merged = t[0]
                        pair = t[1]
                        new_scheme = neighbour.make_clustered_scheme(
                            start_scheme, scheme_name, pair, pair_merged,
                            the_config)
                        r = self.analyse_scheme(new_scheme)
                        diff = r.score - start_score
                        diffs.append(diff)

                    c_matrix = neighbour.update_c_matrix(
                        c_matrix, sub_tuples, subsets, diffs)

                # 4. Find the best pair of subsets, and build a scheme based on that
                # note that this matrix includes diagonals, which will all be zero
                # since this is equivalent to comparing a scheme to itself.
                # so we need to be careful to only proceed if we have a negative change
                # which indicates an improvement in the score
                best_change = np.amin(c_matrix)

                log.debug("Biggest improvement in info score: %s",
                          str(best_change))

                if best_change >= 0:
                    log.info(
                        "Found no schemes that improve the score, stopping")
                    break

                best_pair = neighbour.get_best_pair(c_matrix, best_change,
                                                    subsets)

                best_merged = subset_ops.merge_subsets(best_pair)
                best_scheme = neighbour.make_clustered_scheme(
                    start_scheme, scheme_name, best_pair, best_merged,
                    the_config)
                best_result = self.analyse_scheme(best_scheme)

                # the best change can shift slightly at this point, because
                # calculating the info score on the whole alignment differs a
                # little from calculating it on the single subset
                best_change = self.results.best_score - start_score

                log.info("Best scheme combines subsets: '%s' and '%s'" %
                         (best_pair[0].name, best_pair[1].name))

                log.info(
                    "The best scheme improves the %s score by %.2f to %.1f",
                    the_config.model_selection, np.abs(best_change),
                    self.results.best_score)
                start_scheme = best_scheme
                start_score = best_result.score

                log.debug("Best pair: %s", str([s.name for s in best_pair]))
                log.debug("Merged into: %s", str([best_merged.name]))

                # 5. reset_c_matrix and the subset list
                c_matrix = neighbour.reset_c_matrix(c_matrix, list(best_pair),
                                                    [best_merged], subsets)

                # we updated the subset list in a special way, which matches how we update the c matrix:
                subsets = neighbour.reset_subsets(subsets, list(best_pair),
                                                  [best_merged])

                if not the_config.quick:
                    the_config.reporter.write_scheme_summary(
                        best_scheme, best_result)

                step += 1

        log.info("Greedy algorithm finished after %d steps" % step)
        log.info("Best scoring scheme is scheme %s, with %s score of %.3f" %
                 (self.results.best_scheme.name, the_config.model_selection,
                  self.results.best_score))

        the_config.reporter.write_best_scheme(self.results)
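
A sketch of the fake distance matrix used above, and the pitfall the integer division avoids: on Python 3, (dim * dim - dim) / 2 is a float, and np.zeros() rejects a float size. The dim value is invented:

import numpy as np
from scipy import spatial

dim = 5
d_matrix = np.zeros((dim * dim - dim) // 2)  # 10 entries == comb(5, 2)
d_matrix[:] = np.inf                         # every pair is "equally close"

# squareform() round-trips the condensed vector to a 5 x 5 matrix,
# just as the c_matrix code does
print(spatial.distance.squareform(d_matrix).shape)  # (5, 5)
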
Example #6
    def do_analysis(self):

        # initialisation steps
        model_selection = the_config.model_selection
        partnum = len(the_config.user_subsets)

        if the_config.cluster_max == -987654321:
            the_config.cluster_max = max([1000, (10 * len(the_config.user_subsets))])
            log.info("Set rcluster-max to %d" %the_config.cluster_max) 

        scheme_count = submodels.count_relaxed_clustering_schemes(
            partnum, the_config.cluster_percent, the_config.cluster_max)
        subset_count = submodels.count_relaxed_clustering_subsets(
            partnum, the_config.cluster_percent, the_config.cluster_max)

        log.info("PartitionFinder will have to analyse %d subsets to"
                 " complete this analyses" % subset_count)
        the_config.progress.begin(scheme_count, subset_count)

        # Start with the most partitioned scheme, and record it.
        with logtools.indented(log, "*** Analysing starting scheme ***"):
            the_config.progress.begin(scheme_count, partnum)
            start_scheme = scheme.create_scheme(
                the_config, "start_scheme", range(partnum))
            start_result = self.analyse_scheme(start_scheme)
            start_score = start_result.score
            if not the_config.quick:
                the_config.reporter.write_scheme_summary(
                    self.results.best_scheme, self.results.best_result)

        subsets = [s for s in start_scheme.subsets]
        partnum = len(subsets)
        step = 1
        while True:
            with logtools.indented(
                    log,
                    "*** Relaxed clustering algorithm step %d of up to %d ***"
                    % (step, partnum - 1)):

                # get distances between subsets
                max_schemes = comb(len(start_scheme.subsets), 2)
                log.info("Measuring the similarity of %d subset pairs" % max_schemes)
                d_matrix = neighbour.get_distance_matrix(subsets,
                    the_config.cluster_weights)

                if step == 1:
                    # Now initialise a change in info score matrix to inf
                    c_matrix = np.empty(d_matrix.shape)
                    c_matrix[:] = np.inf
                    c_matrix = spatial.distance.squareform(c_matrix)

                # 1. pick top N subset pairs from distance matrix
                cutoff = int(math.ceil(max_schemes * (the_config.cluster_percent * 0.01)))
                if cutoff <= 0:
                    cutoff = 1
                if the_config.cluster_max is not None and cutoff > the_config.cluster_max:
                    cutoff = the_config.cluster_max
                log.info("Choosing the %d most similar subset pairs" % cutoff)
                closest_pairs = neighbour.get_N_closest_subsets(
                    subsets, the_config, cutoff, d_matrix)

                # 2. analyse K subsets in top N that have not yet been analysed
                pairs_todo = neighbour.get_pairs_todo(closest_pairs, c_matrix, subsets)
                if len(pairs_todo) > 0:
                    log.info("Analysing %d new subset pairs" % len(pairs_todo))
                    new_subs = []
                    sub_tuples = []
                    for pair in pairs_todo:
                        new_sub = subset_ops.merge_subsets(pair)
                        new_subs.append(new_sub)
                        sub_tuples.append((new_sub, pair))

                    the_config.progress.begin(scheme_count, len(new_subs))
                    self.analyse_list_of_subsets(new_subs)

                    # 3. for all K new subsets, update improvement matrix and find best pair
                    log.info("Finding the best partitioning scheme")
                    diffs = []
                    scheme_name = "step_%d" %(step)
                    for t in sub_tuples:
                        pair_merged = t[0]
                        pair = t[1]
                        new_scheme = neighbour.make_clustered_scheme(
                                start_scheme, scheme_name, pair, pair_merged, the_config)
                        r = self.analyse_scheme(new_scheme)
                        diff = r.score - start_score
                        diffs.append(diff)

                    c_matrix = neighbour.update_c_matrix(c_matrix, sub_tuples, subsets, diffs)

                # 4. Find the best pair of subsets, and build a scheme based on that
                # note that this matrix includes diagonals, which will all be zero
                # since this is equivalent to comparing a scheme to itself.
                # so we need to be careful to only proceed if we have a negative change
                # which indicates an improvement in the score
                best_change = np.amin(c_matrix)
                best_scheme = start_scheme

                if best_change >= 0:
                    log.info("Found no schemes that improve the score, stopping")
                    break

                median_improvement = np.median(c_matrix[c_matrix < 0])

                while best_change <= median_improvement:

                    best_pair = neighbour.get_best_pair(c_matrix, best_change, subsets)
                    best_merged = subset_ops.merge_subsets(best_pair)
                    best_scheme = neighbour.make_clustered_scheme(
                        start_scheme, scheme_name, best_pair, best_merged, the_config)
                    start_scheme = best_scheme

                    log.info("Combining subsets: '%s' and '%s'" %(best_pair[0].name, best_pair[1].name))
                    log.debug("This improves the %s score by: %s", the_config.model_selection, str(abs(best_change)))

                    # reset_c_matrix and the subset list
                    c_matrix = neighbour.reset_c_matrix(c_matrix, list(best_pair), [best_merged], subsets)

                    # we update the subset list in a way that means its structure tracks the c-matrix
                    subsets = neighbour.reset_subsets(subsets, list(best_pair), [best_merged])

                    best_change = np.amin(c_matrix)

                    # with rcluster we take only the single best change;
                    # rclusterf stays in this loop and keeps merging
                    if the_config.search == 'rcluster':
                        break

                # the best change can shift slightly at this point, because
                # calculating the info score on the whole alignment differs a
                # little from calculating it on the single subset
                best_result = self.analyse_scheme(best_scheme)
                best_change = self.results.best_score - start_score

                log.info("The best scheme has %d subsets and improves the %s score by %.2f to %.1f",
                    len(best_scheme.subsets),
                    the_config.model_selection,
                    np.abs(best_change),
                    self.results.best_score)
                start_scheme = best_scheme
                start_score = best_result.score


                if not the_config.quick:
                    the_config.reporter.write_scheme_summary(
                        best_scheme, best_result)


                if len(set(start_scheme.subsets)) == 1:
                    break

                step += 1

        log.info("Relaxed clustering algorithm finished after %d steps" % step)
        log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
                 % (self.results.best_scheme.name, model_selection,
                    self.results.best_score))

        if the_config.min_subset_size or the_config.all_states:
            best_scheme = self.clean_scheme(self.results.best_scheme)
            best_result = self.analyse_scheme(best_scheme)

            # scores after cleaning can be worse, so we reset these trackers...
            self.results.best_result = best_result
            self.results.best_score = best_result.score
            self.results.best_scheme = best_scheme
            log.info("Best scoring scheme after cleaning is scheme %s, with %s score of %.3f"
                     % (self.results.best_scheme.name, model_selection,
                        self.results.best_score))


        the_config.reporter.write_best_scheme(self.results)
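
The rclusterf variant keeps merging while the best change is at least as good as the median of all improving changes; boolean masking selects just the negative entries. A tiny sketch with invented numbers:

import numpy as np

c_matrix = np.array([[ 0.0, -8.0,  3.0],
                     [-8.0,  0.0, -2.0],
                     [ 3.0, -2.0,  0.0]])

best_change = np.amin(c_matrix)                         # -8.0
median_improvement = np.median(c_matrix[c_matrix < 0])  # median of [-8, -8, -2, -2] -> -5.0
print(best_change <= median_improvement)                # True: keep merging this step
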
Example #7
    def do_analysis(self):
        '''A greedy algorithm for heuristic partitioning searches'''

        partnum = len(the_config.user_subsets)
        scheme_count = submodels.count_greedy_schemes(partnum)
        subset_count = submodels.count_greedy_subsets(partnum)

        the_config.progress.begin(scheme_count, subset_count)

        # Start with the most partitioned scheme, and record it.
        with logtools.indented(log, "*** Analysing starting scheme ***"):
            the_config.progress.begin(scheme_count, partnum)
            start_scheme = scheme.create_scheme(
                the_config, "start_scheme", range(partnum))
            start_result = self.analyse_scheme(start_scheme)
            start_score = start_result.score
            if not the_config.quick:
                the_config.reporter.write_scheme_summary(
                    self.results.best_scheme, self.results.best_result)

        subsets = [s for s in start_scheme.subsets]

        step = 1
        while len(set(start_scheme.subsets)) > 1:
            with logtools.indented(log, "***Greedy algorithm step %d***" % step):
                name_prefix = "step_%d" % (step)

                # get distances between subsets
                max_schemes = comb(len(start_scheme.subsets), 2)

                # this is a fake distance matrix, so that the greedy algorithm
                # can use all the tricks of the relaxed clustering algorithm
                dim = len(subsets)
                # integer division: np.zeros() needs an int size in Python 3
                d_matrix = np.zeros((dim * dim - dim) // 2)
                d_matrix[:] = np.inf

                if step == 1:
                    # Now initialise a change in info score matrix to inf
                    c_matrix = np.empty(d_matrix.shape)
                    c_matrix[:] = np.inf
                    c_matrix = spatial.distance.squareform(c_matrix)

                # 1. pick top N subset pairs from distance matrix
                cutoff = max_schemes # this defines the greedy algorithm: we look at all schemes

                closest_pairs = neighbour.get_N_closest_subsets(
                    subsets, the_config, cutoff, d_matrix)

                # 2. analyse subsets in top N that have not yet been analysed
                pairs_todo = neighbour.get_pairs_todo(closest_pairs, c_matrix, subsets)
                if len(pairs_todo) > 0:
                    log.info("Analysing %d new subset pairs" % len(pairs_todo))
                    new_subs = []
                    sub_tuples = []
                    for pair in pairs_todo:
                        new_sub = subset_ops.merge_subsets(pair)
                        new_subs.append(new_sub)
                        sub_tuples.append((new_sub, pair))

                    the_config.progress.begin(scheme_count, len(new_subs))
                    self.analyse_list_of_subsets(new_subs)

                    # 3. for all K new subsets, update improvement matrix and find best pair
                    log.info("Finding the best partitioning scheme")
                    diffs = []
                    scheme_name = "step_%d" %(step)
                    for t in sub_tuples:
                        pair_merged = t[0]
                        pair = t[1]
                        new_scheme = neighbour.make_clustered_scheme(
                                start_scheme, scheme_name, pair, pair_merged, the_config)
                        r = self.analyse_scheme(new_scheme)
                        diff = r.score - start_score
                        diffs.append(diff)

                    c_matrix = neighbour.update_c_matrix(c_matrix, sub_tuples, subsets, diffs)

                # 4. Find the best pair of subsets, and build a scheme based on that
                # note that this matrix includes diagonals, which will all be zero
                # since this is equivalent to comparing a scheme to itself.
                # so we need to be careful to only proceed if we have a negative change
                # which indicates an improvement in the score
                best_change = np.amin(c_matrix)

                log.debug("Biggest improvement in info score: %s", str(best_change))

                if best_change >= 0:
                    log.info("Found no schemes that improve the score, stopping")
                    break

                best_pair = neighbour.get_best_pair(c_matrix, best_change, subsets)

                best_merged = subset_ops.merge_subsets(best_pair)
                best_scheme = neighbour.make_clustered_scheme(
                    start_scheme, scheme_name, best_pair, best_merged, the_config)
                best_result = self.analyse_scheme(best_scheme)

                # the best change can shift slightly at this point, because
                # calculating the info score on the whole alignment differs a
                # little from calculating it on the single subset
                best_change = self.results.best_score - start_score

                log.info("Best scheme combines subsets: '%s' and '%s'" %(best_pair[0].name, best_pair[1].name))


                log.info("The best scheme improves the %s score by %.2f to %.1f",
                    the_config.model_selection,
                    np.abs(best_change),
                    self.results.best_score)
                start_scheme = best_scheme
                start_score = best_result.score

                log.debug("Best pair: %s", str([s.name for s in best_pair]))
                log.debug("Merged into: %s", str([best_merged.name]))

                # 5. reset_c_matrix and the subset list
                c_matrix = neighbour.reset_c_matrix(c_matrix, list(best_pair), [best_merged], subsets)

                # we updated the subset list in a special way, which matches how we update the c matrix:
                subsets = neighbour.reset_subsets(subsets, list(best_pair), [best_merged])

                if not the_config.quick:
                    the_config.reporter.write_scheme_summary(
                        best_scheme, best_result)

                step += 1

        log.info("Greedy algorithm finished after %d steps" % step)
        log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
                 % (self.results.best_scheme.name, the_config.model_selection,
                    self.results.best_score))

        the_config.reporter.write_best_scheme(self.results)
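
For the helper get_best_pair, whose implementation lives in the neighbour module and is not shown here, one plausible way to locate the pair behind the best change is an argmin over the square matrix; this is an assumption for illustration, not the module's actual code:

import numpy as np

c_matrix = np.array([[ 0.0,  4.0, -6.0],
                     [ 4.0,  0.0,  1.5],
                     [-6.0,  1.5,  0.0]])

best_change = np.amin(c_matrix)  # -6.0
# unravel the flat argmin back into (row, col) indices
i, j = np.unravel_index(np.argmin(c_matrix), c_matrix.shape)
print(i, j)  # 0 2 -- subsets[0] and subsets[2] would be merged
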
Example #8
    def do_analysis(self):
        log.info("Performing relaxed clustering analysis")

        stop_at = self.cfg.cluster_percent * 0.01

        model_selection = self.cfg.model_selection
        partnum = len(self.cfg.partitions)

        scheme_count = submodels.count_relaxed_clustering_schemes(
            partnum, self.cfg.cluster_percent)
        subset_count = submodels.count_relaxed_clustering_subsets(
            partnum, self.cfg.cluster_percent)

        self.cfg.progress.begin(scheme_count, subset_count)

        # Start with the most partitioned scheme, and record it.
        start_description = range(len(self.cfg.partitions))
        start_scheme = scheme.create_scheme(self.cfg, "start_scheme",
                                            start_description)
        log.info("Analysing starting scheme (scheme %s)" % start_scheme.name)
        self.analyse_scheme(start_scheme)
        self.cfg.reporter.write_scheme_summary(self.results.best_scheme,
                                               self.results.best_result)

        # Start by remembering that we analysed the starting scheme
        subset_counter = 1
        step = 1
        while True:

            log.info("***Relaxed clustering algorithm step %d of %d***" %
                     (step, partnum - 1))
            name_prefix = "step_%d" % (step)

            # Get a list of all possible lumpings of the best_scheme, ordered
            # according to the clustering weights
            lumped_subsets = neighbour.get_ranked_clustered_subsets(
                start_scheme, self.cfg)

            # reduce the size of the lumped subsets to cluster_percent long
            # round up so the cutoff is never zero
            cutoff = int(math.ceil(len(lumped_subsets) * stop_at))
            lumped_subsets = lumped_subsets[:cutoff]

            # Now analyse the lumped schemes
            lumpings_done = 0
            old_best_score = self.results.best_score

            for subset_grouping in lumped_subsets:
                scheme_name = "%s_%d" % (name_prefix, lumpings_done + 1)
                lumped_scheme = neighbour.make_clustered_scheme(
                    start_scheme, scheme_name, subset_grouping, self.cfg)

                new_result = self.analyse_scheme(lumped_scheme)

                log.debug("Difference in %s: %.1f", self.cfg.model_selection,
                          (new_result.score - old_best_score))

                lumpings_done += 1

            if self.results.best_score != old_best_score:
                log.info(
                    "Analysed %.1f percent of the schemes for this step. The best "
                    "scheme changed the %s score by %.1f units.",
                    self.cfg.cluster_percent, self.cfg.model_selection,
                    (self.results.best_score - old_best_score))

                # write out the best scheme
                self.results.best_scheme.name = "step_%d" % step
                self.cfg.reporter.write_scheme_summary(
                    self.results.best_scheme, self.results.best_result)

                # Now we find out which is the best lumping we know of for this step
                start_scheme = self.results.best_scheme
            else:
                log.info(
                    "Analysed %.1f percent of the schemes for this step and found no schemes "
                    "that improve the score, stopping",
                    self.cfg.cluster_percent)
                break

            # We're done if it's the scheme with everything together
            if len(set(lumped_scheme.subsets)) == 1:
                break

            step += 1

        log.info("Relaxed clustering algorithm finished after %d steps" % step)
        log.info("Best scoring scheme is scheme %s, with %s score of %.3f" %
                 (self.results.best_scheme.name, model_selection,
                  self.results.best_score))

        self.cfg.reporter.write_best_scheme(self.results)

    def do_analysis(self):
        # Copied and pasted from greedy analysis
        partnum = len(self.cfg.user_subsets)
        scheme_count = submodels.count_greedy_schemes(partnum)
        subset_count = submodels.count_greedy_subsets(partnum)

        self.cfg.progress.begin(scheme_count, subset_count)

        # Start with the most partitioned scheme
        start_description = range(partnum)
        start_scheme = scheme.create_scheme(
            self.cfg, "start_scheme", start_description)


        log.info("Analysing starting scheme (scheme %s)" % start_scheme.name)
        old_score = self.analyse_scheme(start_scheme)

        # Get first scheme
        best_scheme = start_scheme
        subset_index = 0
        all_subsets = list(best_scheme.subsets)
        processor = self.cfg.processor
        alignment_path = self.filtered_alignment_path
        tree_path = processor.make_tree_path(alignment_path)

        while subset_index < len(all_subsets):
            current_subset = all_subsets[subset_index]
            split_subsets = kmeans.kmeans_split_subset(
                self.cfg, self.alignment, current_subset, tree_path)

            # a return value of 1 signals that the subset could not be split
            if split_subsets == 1:
                subset_index += 1

            else:
                # Take a copy
                updated_subsets = all_subsets[:]

                # Replace the current subset with its split pieces via slice
                # assignment. This expanding list is the key to avoiding
                # recursion: newly split subsets are revisited by the loop.
                updated_subsets[subset_index:subset_index + 1] = split_subsets

                test_scheme = scheme.Scheme(self.cfg, "Current Scheme", updated_subsets)

                try:
                    best_result = self.analyse_scheme(best_scheme)
                    new_result = self.analyse_scheme(test_scheme)

                    log.info("Current best score is: " + str(best_result))
                    log.info("Current new score is: " + str(new_result))
                    if new_result.score < best_result.score:
                        log.info("New score " + str(subset_index) + " is better and will be set to best score")
                        best_scheme = test_scheme

                        # Change this to the one with split subsets in it. Note that
                        # the subset_index now points a NEW subset, one that was split
                        all_subsets = updated_subsets
                    else:
                        # Move to the next subset in the all_subsets list
                        subset_index += 1

                # If PhyML or RAxML fails, it is most likely because a split
                # produced a subset with no alignment patterns; catch the
                # error and move to the next subset without splitting.
                except PhylogenyProgramError:
                    log.info("Phylogeny program generated an error so this "
                             "subset was not split, see error above")
                    subset_index += 1
        # Now start the greedy analysis; we still need to figure out how to
        # make it go through more than one scheme...

        start_scheme = best_scheme
        partnum = len(start_scheme.subsets)
        scheme_count = submodels.count_greedy_schemes(partnum)
        subset_count = submodels.count_greedy_subsets(partnum)
        self.cfg.progress.begin(scheme_count, subset_count)
        start_description = range(partnum)

        step = 1
        cur_s = 2

        # Now we try out all lumpings of the current scheme, to see if we can
        # find a better one and if we do, we just keep going
        while True:
            log.info("***Greedy algorithm step %d***" % step)

            old_best_score = self.results.best_score

            # Get an iterable of all possible pairs of subsets in best_scheme
            lumped_subsets = itertools.combinations(start_scheme.subsets, 2)

            for subset_grouping in lumped_subsets:
                scheme_name = cur_s
                lumped_scheme = neighbour.make_clustered_scheme(
                    start_scheme, scheme_name, subset_grouping, self.cfg)

                new_result = self.analyse_scheme(lumped_scheme)

                log.debug("Difference in %s: %.1f",
                          self.cfg.model_selection,
                          (new_result.score-old_best_score))

                cur_s += 1

            if self.results.best_score != old_best_score:
                log.info("Analysed all schemes for this step. The best "
                         "scheme changed the %s score by %.1f units.",
                         self.cfg.model_selection,
                         (self.results.best_score - old_best_score))

                self.results.best_scheme.name = "step_%d" % step
                self.cfg.reporter.write_scheme_summary(
                    self.results.best_scheme, self.results.best_result)

                # Now we find out which is the best lumping we know of for this step
                start_scheme = self.results.best_scheme
            else:
                log.info("Analysed all schemes for this step and found no schemes "
                         "that improve the score, stopping")
                break

            # We're done if it's the scheme with everything together
            if len(set(lumped_scheme.subsets)) == 1:
                break

            step += 1

        log.info("Greedy algorithm finished after %d steps" % step)
        log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
                 % (self.results.best_scheme.name, self.cfg.model_selection,
                    self.results.best_score))

        self.cfg.reporter.write_best_scheme(self.results)
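
The kmeans splitter above relies on Python slice assignment to swap one subset for its split pieces in place, which is how the loop grows the list without recursion. A self-contained sketch with made-up subset names:

all_subsets = ['a', 'b', 'c']
subset_index = 1
split_subsets = ['b1', 'b2']

updated_subsets = all_subsets[:]  # take a copy, as the code above does
updated_subsets[subset_index:subset_index + 1] = split_subsets
print(updated_subsets)  # ['a', 'b1', 'b2', 'c']
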
    def do_analysis(self):
        '''A greedy algorithm for heuristic partitioning searches'''

        log.info("Performing greedy analysis")

        partnum = len(self.cfg.user_subsets)
        scheme_count = submodels.count_greedy_schemes(partnum)
        subset_count = submodels.count_greedy_subsets(partnum)

        self.cfg.progress.begin(scheme_count, subset_count)

        # Start with the most partitioned scheme
        start_description = range(partnum)
        start_scheme = scheme.create_scheme(
            self.cfg, "start_scheme", start_description)

        log.info("Analysing starting scheme (scheme %s)" % start_scheme.name)
        self.analyse_scheme(start_scheme)

        step = 1
        cur_s = 2

        # Now we try out all lumpings of the current scheme, to see if we can
        # find a better one and if we do, we just keep going
        while True:
            log.info("***Greedy algorithm step %d***" % step)

            old_best_score = self.results.best_score

            # Get an iterable of all possible pairs of subsets in best_scheme
            lumped_subsets = itertools.combinations(start_scheme.subsets, 2)

            for subset_grouping in lumped_subsets:
                scheme_name = cur_s
                lumped_scheme = neighbour.make_clustered_scheme(
                    start_scheme, scheme_name, subset_grouping, self.cfg)

                new_result = self.analyse_scheme(lumped_scheme)

                log.debug("Difference in %s: %.1f",
                          self.cfg.model_selection,
                          (new_result.score-old_best_score))

                cur_s += 1

            if self.results.best_score != old_best_score:
                log.info("Analysed all schemes for this step. The best "
                         "scheme changed the %s score by %.1f units.",
                         self.cfg.model_selection,
                         (self.results.best_score - old_best_score))

                self.results.best_scheme.name = "step_%d" % step
                self.cfg.reporter.write_scheme_summary(
                    self.results.best_scheme, self.results.best_result)

                # Now we find out which is the best lumping we know of for this step
                start_scheme = self.results.best_scheme
            else:
                log.info("Analysed all schemes for this step and found no schemes "
                         "that improve the score, stopping")
                break

            # We're done if it's the scheme with everything together
            if len(set(lumped_scheme.subsets)) == 1:
                break

            step += 1

        log.info("Greedy algorithm finished after %d steps" % step)
        log.info("Best scoring scheme is scheme %s, with %s score of %.3f"
                 % (self.results.best_scheme.name, self.cfg.model_selection,
                    self.results.best_score))

        self.cfg.reporter.write_best_scheme(self.results)
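
Finally, the plain greedy search above enumerates every candidate lumping with itertools.combinations, so each step considers all comb(n, 2) pairs of the current scheme's subsets. A minimal sketch, with strings standing in for real subset objects:

import itertools

subsets = ['s1', 's2', 's3']
for pair in itertools.combinations(subsets, 2):
    print(pair)
# ('s1', 's2')
# ('s1', 's3')
# ('s2', 's3')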