Example #1
    def one_kmeans_step(self, start_subsets, step, tree_path):
        name_prefix = "step_%d" % (step)

        # 1. Make split subsets

        with logtools.indented(log, "Splitting subsets using k-means"):
            split_subs = self.split_subsets(start_subsets, tree_path)

            # 2. Analyse the split subsets together, to take advantage of parallelisation
            subs = []

            # make a list from the dictionary
            for vals in split_subs.values():
                subs.extend(vals)

        log.debug("%d subsets successfully split" %
                  (len(subs) - len(start_subsets)))

        with logtools.indented(
                log,
                "Calculating scores of all new subsets that can be analysed"):
            self.analyse_list_of_subsets(subs)

            # 3. Build new list of subsets
            new_scheme_subs = self.build_new_subset_list(
                name_prefix, split_subs, start_subsets)

        # 4. Are we done yet?
        if len(new_scheme_subs) == len(start_subsets):
            log.info(
                """Could not improve %s score. Kmeans algorithm finished.""" %
                (the_config.model_selection))
            done = True
        else:
            n_splits = len(new_scheme_subs) - len(start_subsets)

            if n_splits > 1:
                t = 'subsets'
            else:
                t = 'subset'
            log.info("""The %s score of %d %s
                     improved when split""" %
                     (the_config.model_selection, n_splits, t))

            start_subsets = new_scheme_subs

            done = False

        return done, start_subsets
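
The splitting itself happens in self.split_subsets, which is not shown in this example. Below is a minimal sketch of one way a subset's sites could be divided in two, assuming k-means is run on per-site entropies with scikit-learn; the function name and the site_entropies input are hypothetical stand-ins, not the real PartitionFinder machinery.

import numpy as np
from sklearn.cluster import KMeans

def split_sites_by_entropy(site_indices, site_entropies):
    # Cluster sites into two groups by their entropy values.
    values = np.array([site_entropies[i] for i in site_indices]).reshape(-1, 1)

    # k-means cannot split identical values into two meaningful clusters,
    # so keep the subset whole in that case
    if np.allclose(values, values[0]):
        return [site_indices]

    labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(values)
    group_a = [s for s, lab in zip(site_indices, labels) if lab == 0]
    group_b = [s for s, lab in zip(site_indices, labels) if lab == 1]
    return [group_a, group_b]

# toy data: six low-entropy sites and four high-entropy sites
entropies = {i: 0.05 for i in range(6)}
entropies.update({i: 0.9 for i in range(6, 10)})
print(split_sites_by_entropy(list(range(10)), entropies))
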
Example #2
    def clean_scheme(self, start_scheme):
        # Here we look for and fix up subsets that are too small or don't have all states
        keep_going = 1
        merges = 0
        if keep_going == 1:
            with logtools.indented(
                    log,
                    "*** Checking subsets from scheme '%s' meet --min-subset-size and --all_states settings ***"
                    % start_scheme.name):
                while keep_going > 0:

                    subsets = [s for s in start_scheme.subsets]

                    # sort the subsets, to keep results consistent over re-runs
                    subsets.sort(key=lambda x: 1.0 / float(len(x.columns)))

                    # run through all subsets
                    found = 0
                    for sub in subsets:
                        state_problems = self.alignment.check_state_probs(
                            sub, the_config)

                        if (len(sub.columns) < the_config.min_subset_size
                                or state_problems):

                            # merge that subset with nearest neighbour
                            new_pair = neighbour.get_closest_subset(
                                sub, subsets, the_config)

                            log.info(
                                "Subset '%s' will be merged with subset '%s'" %
                                (new_pair[0].name, new_pair[1].name))
                            new_pair_merged = subset_ops.merge_subsets(
                                new_pair)
                            start_scheme = neighbour.make_clustered_scheme(
                                start_scheme, "cleaned_scheme", new_pair,
                                new_pair_merged, the_config)
                            the_config.progress.begin(1, 1)
                            self.analyse_scheme(start_scheme)
                            subsets = [s for s in start_scheme.subsets]
                            merges = merges + 1
                            found = 1
                            break

                    # if we got to here, there were no subsets to merge
                    if found == 0:
                        keep_going = 0

                    if len(subsets) == 1:
                        log.error(
                            "The settings you have used for --all-states and/or --min-subset-size mean that all of your subsets have been merged into one prior to any analysis. Thus, no analysis is necessary. Please check and try again"
                        )
                        raise AnalysisError

                log.info(
                    "%d subsets merged because of --min-subset-size and/or --all-states settings"
                    % merges)
        return (start_scheme)
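
The merge-and-restart pattern above can be hard to follow because the subset list is rebuilt after every merge. Here is a minimal sketch of the same merge-until-valid loop, assuming subsets are plain lists of site indices and using size similarity as a stand-in for neighbour.get_closest_subset.

def clean_subsets(subsets, min_size):
    # Repeatedly merge any too-small subset with its "closest" neighbour
    # until every subset meets min_size (or only one subset is left).
    subsets = [list(s) for s in subsets]
    merges = 0
    while True:
        # largest first, mirroring the 1/len sort key used in clean_scheme
        subsets.sort(key=len, reverse=True)
        too_small = next((s for s in subsets if len(s) < min_size), None)
        if too_small is None or len(subsets) == 1:
            break
        others = [s for s in subsets if s is not too_small]
        closest = min(others, key=lambda s: abs(len(s) - len(too_small)))
        subsets.remove(too_small)
        subsets.remove(closest)
        subsets.append(sorted(too_small + closest))
        merges += 1
    return subsets, merges

print(clean_subsets([[0, 1], [2, 3, 4, 5], [6, 7, 8]], min_size=3))
# ([[0, 1, 6, 7, 8], [2, 3, 4, 5]], 1)
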
    def do_analysis(self):
        '''A kmeans algorithm for heuristic partitioning searches'''

        start_result, start_scheme, tree_path = self.setup()

        step = 0

        start_subsets = list(
            start_scheme.subsets)  # we only work on lists of subsets

        self.analyse_list_of_subsets(start_subsets)

        # now we suppress ExternalProgramError for the rest of the algorithm
        the_config.suppress_errors = True

        for s in start_subsets:
            if s.fabricated:
                log.error("""One or more of your starting datablocks could not
                          be analysed. Please check your data and try again.
                          One way to fix this is to join your small datablocks
                          together into larger datablocks""")
                raise AnalysisError

        while True:
            step += 1
            with logtools.indented(log,
                                   "***k-means algorithm step %d***" % step):
                done, start_subsets = self.one_kmeans_step(
                    start_subsets, step, tree_path)

            if done:
                break

        # OK, we're done; we just need to deal with fabricated subsets
        final_scheme = self.finalise_fabrication(start_subsets, step)

        # Finally, for krmeans, we put the invariant sites back with their
        # nearest variable neighbours
        if the_config.search == 'krmeans':
            log.info("Reassigning invariant sites for krmeans algorithm")
            # the definition of krmeans is that we reassign the zero entropies
            final_subsets = self.reassign_invariant_sites(final_scheme.subsets)
            final_scheme = scheme.Scheme(the_config, "final_scheme_reassigned",
                                         final_subsets)

        log.info("Analysing final scheme")

        final_result = self.analyse_scheme(final_scheme)

        self.report(step)

        if not the_config.quick:
            the_config.reporter.write_scheme_summary(final_scheme,
                                                     final_result)

        return (final_scheme)
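
Here is a minimal sketch of the done-flag driver loop used by do_analysis, with a toy step function in place of one_kmeans_step; the halving rule and max_len are placeholders, not the real splitting logic.

def toy_step(subsets, max_len=4):
    # Split any list longer than max_len in half; report done when nothing
    # was split, mimicking the done flag returned by one_kmeans_step.
    new_subsets = []
    for s in subsets:
        if len(s) > max_len:
            mid = len(s) // 2
            new_subsets.extend([s[:mid], s[mid:]])
        else:
            new_subsets.append(s)
    return len(new_subsets) == len(subsets), new_subsets

subsets = [list(range(10))]
step = 0
while True:
    step += 1
    done, subsets = toy_step(subsets)
    if done:
        break
print("finished after %d steps: %s" % (step, subsets))
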
    def setup(self):

        log.warning(
            "Warning as of April 2016: We have noticed that the kmeans \
            algorithm does not perform well on some simulated datasets. \
            We are working on investigating and addressing this \
            but in the mean time we suggest being very cautious about using \
            this algorithm. At the very least, you should try other approaches \
            (e.g. partitioning by locus), and investigate your answers carefully \
            (both the trees and the partitioning schemes). If you have any \
            questions, please get in touch on the google group. Note that this \
            warning does not apply to cases where you are using models that have \
            an ascertainment bias for datasets that include only variable sites \
            as is often the case with morphological analyses.")

        # set the default minimum subset size to 100 for kmeans analyses
        if the_config.min_subset_size == False:
            the_config.min_subset_size = 100

        partnum = len(the_config.user_subsets)
        the_config.progress.begin(1, 1)

        # Start with the most partitioned scheme
        start_description = range(partnum)
        start_scheme = scheme.create_scheme(the_config, "start_scheme",
                                            start_description)

        site_max = sum([len(s.columns) for s in start_scheme.subsets])

        if the_config.min_subset_size > site_max:
            log.error("The minimum subset size must be smaller than the \
                total number of sites you want to analyse. Your minimum \
                subset size is %d, and your alignment is %d sites. Please \
                check and try again." % (the_config.min_subset_size, site_max))
            raise AnalysisError

        with logtools.indented(
                log, "**Analysing starting scheme (scheme %s)**" %
                start_scheme.name):
            start_result = self.analyse_scheme(start_scheme)

            if not the_config.quick:
                the_config.reporter.write_scheme_summary(
                    start_scheme, start_result)

            tree_path = the_config.processor.make_tree_path(
                self.filtered_alignment_path)

        if the_config.kmeans == 'tiger' and the_config.datatype != 'morphology':
            log.error("You have selected kmeans and tiger \
                rates. This is an unsupported option for anything except \
                morphological data. The kmeans algorithm \
                now works with entropies, not TIGER rates.")
            raise AnalysisError

        return start_result, start_scheme, tree_path
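
A minimal sketch of the minimum-subset-size handling in this setup(): apply the default when the value is unset, then reject it if it exceeds the number of sites. The names are placeholders for the real config and subset objects.

DEFAULT_MIN_SUBSET_SIZE = 100

def resolve_min_subset_size(min_subset_size, subset_site_counts):
    # Apply the default, then check the value against the alignment length.
    if not min_subset_size:
        min_subset_size = DEFAULT_MIN_SUBSET_SIZE
    site_max = sum(subset_site_counts)
    if min_subset_size > site_max:
        raise ValueError(
            "The minimum subset size (%d) must be smaller than the total "
            "number of sites (%d)" % (min_subset_size, site_max))
    return min_subset_size

print(resolve_min_subset_size(False, [250, 300]))  # 100
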
    def do_analysis(self):
        '''A kmeans algorithm for heuristic partitioning searches'''

        start_result, start_scheme, tree_path = self.setup()

        step = 0

        start_subsets = list(
            start_scheme.subsets)  # we only work on lists of subsets

        self.analyse_list_of_subsets(start_subsets)

        # now we suppress ExternalProgramError for the rest of the algorithm
        the_config.suppress_errors = True

        for s in start_subsets:
            if s.fabricated:
                log.error("""One or more of your starting datablocks could not
                          be analysed. Please check your data and try again.
                          One way to fix this is to join your small datablocks
                          together into larger datablocks""")
                raise AnalysisError

        while True:
            step += 1
            with logtools.indented(log,
                                   "***k-means algorithm step %d***" % step):
                done, start_subsets = self.one_kmeans_step(
                    start_subsets, step, tree_path)

            if done:
                break

        # OK, we're done; we just need to deal with fabricated subsets
        final_scheme = self.finalise_fabrication(start_subsets, step)

        log.info("Analysing final scheme")
        final_result = self.analyse_scheme(final_scheme)

        self.report(step)

        return (final_scheme)
Example #6
    def setup(self):

        log.warning(
            "Warning as of April 2016: We have noticed that the kmeans \
            algorithm does not perform well on some simulated datasets. \
            We are working on investigating and addressing this \
            but in the mean time we suggest being very cautious about using \
            this algorithm. At the very least, you should try other approaches \
            (e.g. partitioning by locus), and investigate your answers carefully \
            (both the trees and the partitioning schemes). If you have any \
            questions, please get in touch on the google group.")

        # set the default minimum subset size to 100 for kmeans analyses
        if the_config.min_subset_size == False:
            the_config.min_subset_size = 100

        partnum = len(the_config.user_subsets)
        the_config.progress.begin(1, 1)

        # Start with the most partitioned scheme
        start_description = range(partnum)
        start_scheme = scheme.create_scheme(the_config, "start_scheme",
                                            start_description)

        if len(start_scheme.subsets) > 1:
            log.error("The k-means algorithm is designed to analyse \
                the entire alignment at once. To use it, please define a \
                single data block that includes all of your sites, and try \
                again.")
            raise AnalysisError

        site_max = sum([len(s.columns) for s in start_scheme.subsets])

        if the_config.min_subset_size > site_max:
            log.error("The minimum subset size must be smaller than the \
                total number of sites you want to analyse. Your minimum \
                subset size is %d, and your alignment is %d sites. Please \
                check and try again." % (the_config.min_subset_size, site_max))
            raise AnalysisError

        with logtools.indented(
                log, "**Analysing starting scheme (scheme %s)**" %
                start_scheme.name):
            start_result = self.analyse_scheme(start_scheme)

            if not the_config.quick:
                the_config.reporter.write_scheme_summary(
                    start_scheme, start_result)

            tree_path = the_config.processor.make_tree_path(
                self.filtered_alignment_path)

        if the_config.kmeans == 'tiger' and the_config.datatype != 'morphology':
            try:
                from _tiger import TigerDNA
                the_config.TigerDNA = TigerDNA
            except ImportError:
                log.error("Couldn't find compiled tiger code.")
                log.error("You have selected kmeans and tiger \
                    rates. This is an unsupported option, if you still wish to use \
                    this option, you must compile the tiger code.")
                log.error(
                    "Once you compile the tiger code, this option will work. \
                    But please note that this is an \
                    unsupported option. For empirical work we recommend using \
                    entropy calculations for site rates, which is the default \
                    behaviour for the kmeans algorithm in PF2.")
                raise AnalysisError
        else:
            the_config.TigerDNA = None

        return start_result, start_scheme, tree_path
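
A minimal sketch of the optional-import pattern used for the compiled tiger code. The module and class names here are hypothetical placeholders, and like the original the sketch aborts with guidance rather than silently falling back.

import logging

log = logging.getLogger(__name__)

def load_compiled_extension():
    # Try to import an optional compiled module; abort with guidance if it
    # is missing ('_some_compiled_module' is a hypothetical name).
    try:
        from _some_compiled_module import FastRates
    except ImportError:
        log.error("Couldn't find the compiled extension. Please compile it, "
                  "or use the default entropy-based site rates instead.")
        raise
    return FastRates
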
Example #7
    def finalise_fabrication(self, start_subsets, step):

        fabricated_subsets = []
        for s in start_subsets:

            # here we put a sensible lower limit on the size of subsets
            if len(s.columns) < the_config.min_subset_size:
                s.fabricated = True
                log.debug("Subset %s with only %d sites found" %
                          (s.subset_id, len(s.columns)))

            # here we can test if the alignment has all states:
            state_probs = self.alignment.check_state_probs(s, the_config)
            if state_probs:
                s.fabricated = True
                log.debug(
                    "Subset %s does not have all states in the alignment",
                    s.subset_id)

            if s.fabricated:
                fabricated_subsets.append(s)
                log.debug("added %s to fabricated subset", s.name)

        if fabricated_subsets:
            with logtools.indented(log, "Finalising partitioning scheme"):
                log.debug("There are %d/%d fabricated subsets" %
                          (len(fabricated_subsets), len(start_subsets)))

                i = 1
                while fabricated_subsets:

                    all_subs = start_subsets

                    # occasionally subsets where every value == 0.0 are given a
                    # centroid of None by scikit-learn. The true entropy here
                    # is 0.0 for all sites, so the true centroid is 0.0
                    for s in all_subs:
                        if s.centroid is None:
                            s.centroid = [0.0]
                            log.debug("Fixed a subset with a centroid of None")
                            log.debug("The subset has %d columns" %
                                      len(s.columns))

                    s = fabricated_subsets.pop(0)

                    log.debug("Working on fabricated subset %s with %d sites" %
                              (s.subset_id, len(s.columns)))
                    log.info("Finalising subset %d", i)
                    i = i + 1

                    all_subs.remove(s)

                    centroid = s.centroid

                    best_match = None

                    # get closest subset to s
                    for sub in all_subs:

                        centroid_array = [sub.centroid, centroid]

                        euclid_dist = spatial.distance.pdist(centroid_array)

                        if best_match is None or euclid_dist < best_match:
                            best_match = euclid_dist
                            closest_sub = sub

                    # join s with closest_sub to make joined_sub
                    merged_sub = subset_ops.merge_subsets([s, closest_sub])

                    # remove closest sub
                    all_subs.remove(closest_sub)

                    # and if closest_sub was fabricated too, we remove it here
                    if closest_sub in fabricated_subsets:
                        fabricated_subsets.remove(closest_sub)

                    # analyse joined sub
                    self.analyse_list_of_subsets([merged_sub])

                    # here we put a sensible lower limit on the size of subsets
                    if len(merged_sub.columns) < the_config.min_subset_size:
                        merged_sub.fabricated = True

                    # if joined has to be fabricated, add to fabricated list
                    if merged_sub.fabricated:
                        fabricated_subsets.append(merged_sub)

                    all_subs.append(merged_sub)
        else:
            all_subs = start_subsets

        # now build a scheme from all_subs, and it should work
        final_scheme = scheme.Scheme(the_config, "final_scheme", all_subs)

        # return final scheme
        return final_scheme
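
A minimal sketch of the closest-centroid search performed above, calling scipy's pdist on a two-row array and checking for the unset best distance before comparing.

import numpy as np
from scipy import spatial

def closest_by_centroid(target_centroid, candidates):
    # candidates: list of (name, centroid) pairs, centroids of equal length.
    best_dist = None
    best_name = None
    for name, centroid in candidates:
        dist = spatial.distance.pdist(np.array([centroid, target_centroid]))[0]
        if best_dist is None or dist < best_dist:
            best_dist = dist
            best_name = name
    return best_name, best_dist

print(closest_by_centroid([0.0], [("a", [0.5]), ("b", [0.1]), ("c", [2.0])]))
# closest candidate is 'b'
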
Example #8
    def do_analysis(self):

        # initialisation steps
        model_selection = the_config.model_selection
        partnum = len(the_config.user_subsets)

        scheme_count = submodels.count_relaxed_clustering_schemes(
            partnum, the_config.cluster_percent, the_config.cluster_max)
        subset_count = submodels.count_relaxed_clustering_subsets(
            partnum, the_config.cluster_percent, the_config.cluster_max)

        log.info("PartitionFinder will have to analyse %d subsets to"
                 " complete this analyses" % subset_count)
        the_config.progress.begin(scheme_count, subset_count)

        # Start with the most partitioned scheme, and record it.
        with logtools.indented(log, "*** Analysing starting scheme ***"):
            the_config.progress.begin(scheme_count, partnum)
            start_scheme = scheme.create_scheme(the_config, "start_scheme",
                                                range(partnum))
            start_result = self.analyse_scheme(start_scheme)
            start_score = start_result.score
            if not the_config.quick:
                the_config.reporter.write_scheme_summary(
                    self.results.best_scheme, self.results.best_result)

        subsets = [s for s in start_scheme.subsets]
        partnum = len(subsets)
        step = 1
        while True:
            with logtools.indented(
                    log,
                    "*** Relaxed clustering algorithm step %d of up to %d ***"
                    % (step, partnum - 1)):

                # get distances between subsets
                max_schemes = comb(len(start_scheme.subsets), 2)
                log.info("Measuring the similarity of %d subset pairs" %
                         max_schemes)
                d_matrix = neighbour.get_distance_matrix(
                    subsets, the_config.cluster_weights)

                if step == 1:
                    # Now initialise a change in info score matrix to inf
                    c_matrix = np.empty(d_matrix.shape)
                    c_matrix[:] = np.inf
                    c_matrix = spatial.distance.squareform(c_matrix)

                # 1. pick top N subset pairs from distance matrix
                cutoff = int(
                    math.ceil(max_schemes *
                              (the_config.cluster_percent * 0.01)))
                if cutoff <= 0: cutoff = 1
                if the_config.cluster_max is not None and cutoff > the_config.cluster_max:
                    cutoff = the_config.cluster_max
                log.info("Choosing the %d most similar subset pairs" % cutoff)
                closest_pairs = neighbour.get_N_closest_subsets(
                    subsets, the_config, cutoff, d_matrix)

                # 2. analyse K subsets in top N that have not yet been analysed
                pairs_todo = neighbour.get_pairs_todo(closest_pairs, c_matrix,
                                                      subsets)
                if len(pairs_todo) > 0:
                    log.info("Analysing %d new subset pairs" % len(pairs_todo))
                    new_subs = []
                    sub_tuples = []
                    for pair in pairs_todo:
                        new_sub = subset_ops.merge_subsets(pair)
                        new_subs.append(new_sub)
                        sub_tuples.append((new_sub, pair))

                    the_config.progress.begin(scheme_count, len(new_subs))
                    self.analyse_list_of_subsets(new_subs)

                    # 3. for all K new subsets, update improvement matrix and find best pair
                    log.info("Finding the best partitioning scheme")
                    diffs = []
                    scheme_name = "step_%d" % (step)
                    for t in sub_tuples:
                        pair_merged = t[0]
                        pair = t[1]
                        new_scheme = neighbour.make_clustered_scheme(
                            start_scheme, scheme_name, pair, pair_merged,
                            the_config)
                        r = self.analyse_scheme(new_scheme)
                        diff = r.score - start_score
                        diffs.append(diff)

                    c_matrix = neighbour.update_c_matrix(
                        c_matrix, sub_tuples, subsets, diffs)

                # 4. Find the best pair of subsets, and build a scheme based on that
                # note that this matrix includes diagonals, which will all be zero
                # since this is equivalent to comparing a scheme to itself.
                # so we need to be careful to only proceed if we have a negative change
                # which indicates an improvement in the score
                best_change = np.amin(c_matrix)
                best_scheme = start_scheme

                if best_change >= 0:
                    log.info(
                        "Found no schemes that improve the score, stopping")
                    break

                median_improvement = np.median(c_matrix[c_matrix < 0])

                while best_change <= median_improvement:

                    best_pair = neighbour.get_best_pair(
                        c_matrix, best_change, subsets)
                    best_merged = subset_ops.merge_subsets(best_pair)
                    best_scheme = neighbour.make_clustered_scheme(
                        start_scheme, scheme_name, best_pair, best_merged,
                        the_config)
                    start_scheme = best_scheme

                    log.info("Combining subsets: '%s' and '%s'" %
                             (best_pair[0].name, best_pair[1].name))
                    log.info("This improves the %s score by: %s",
                             the_config.model_selection, str(abs(best_change)))

                    # reset_c_matrix and the subset list
                    c_matrix = neighbour.reset_c_matrix(
                        c_matrix, list(best_pair), [best_merged], subsets)

                    # we update the subset list in a way that means its structure tracks the c-matrix
                    subsets = neighbour.reset_subsets(subsets, list(best_pair),
                                                      [best_merged])

                    best_change = np.amin(c_matrix)

                    if the_config.search == 'rcluster':
                        # with rcluster we just take the single best change;
                        # rclusterf stays in this loop and keeps merging
                        break

                # the best change can shift by a fraction at this point,
                # because calculating the info score on the whole alignment
                # is a little different from doing it on the one subset
                best_result = self.analyse_scheme(best_scheme)
                best_change = self.results.best_score - start_score

                log.info(
                    "The best scheme has %d subsets and improves the %s score by %.2f to %.1f",
                    len(best_scheme.subsets), the_config.model_selection,
                    np.abs(best_change), self.results.best_score)
                start_scheme = best_scheme
                start_score = best_result.score

                if not the_config.quick:
                    the_config.reporter.write_scheme_summary(
                        best_scheme, best_result)

                if len(set(start_scheme.subsets)) == 1:
                    break

                step += 1

        log.info("Relaxed clustering algorithm finished after %d steps" % step)
        log.info("Best scoring scheme is scheme %s, with %s score of %.3f" %
                 (self.results.best_scheme.name, model_selection,
                  self.results.best_score))

        if the_config.min_subset_size or the_config.all_states:
            best_scheme = self.clean_scheme(self.results.best_scheme)
            best_result = self.analyse_scheme(best_scheme)

            # scores after cleaning can be worse, so we reset these trackers...
            self.results.best_result = best_result
            self.results.best_score = best_result.score
            self.results.best_scheme = best_scheme
            log.info(
                "Best scoring scheme after cleaning is scheme %s, with %s score of %.3f"
                % (self.results.best_scheme.name, model_selection,
                   self.results.best_score))

        the_config.reporter.write_best_scheme(self.results)
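
The change-in-score matrix above starts life as a scipy "condensed" vector (one entry per subset pair) and is expanded with squareform. Here is a minimal sketch of that representation and of picking the best (most negative) pair, with toy values in place of the real score changes.

import numpy as np
from scipy import spatial

n_subsets = 4
# condensed form: n*(n-1)/2 entries, one per unordered pair, initialised to inf
condensed = np.full(n_subsets * (n_subsets - 1) // 2, np.inf)

# squareform expands it to an n x n matrix with a zero diagonal, which is why
# the algorithm only acts when the minimum entry is negative
square = spatial.distance.squareform(condensed)
square[1, 2] = square[2, 1] = -3.5   # pretend merging subsets 1 and 2 helps

negatives = np.where(square < 0, square, np.inf)
i, j = np.unravel_index(np.argmin(negatives), negatives.shape)
print("merge subsets %d and %d (score change %.1f)" % (i, j, square[i, j]))
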
Example #9
    def do_analysis(self):
        '''A greedy algorithm for heuristic partitioning searches'''

        partnum = len(the_config.user_subsets)
        scheme_count = submodels.count_greedy_schemes(partnum)
        subset_count = submodels.count_greedy_subsets(partnum)

        the_config.progress.begin(scheme_count, subset_count)

        # Start with the most partitioned scheme, and record it.
        with logtools.indented(log, "*** Analysing starting scheme ***"):
            the_config.progress.begin(scheme_count, partnum)
            start_scheme = scheme.create_scheme(the_config, "start_scheme",
                                                range(partnum))
            start_result = self.analyse_scheme(start_scheme)
            start_score = start_result.score
            if not the_config.quick:
                the_config.reporter.write_scheme_summary(
                    self.results.best_scheme, self.results.best_result)

        subsets = [s for s in start_scheme.subsets]

        step = 1
        while len(set(start_scheme.subsets)) > 1:
            with logtools.indented(log,
                                   "***Greedy algorithm step %d***" % step):
                name_prefix = "step_%d" % (step)

                # get distances between subsets
                max_schemes = comb(len(start_scheme.subsets), 2)

                # this is a fake distance matrix, so that the greedy algorithm
                # can use all the tricks of the relaxed clustering algorithm
                dim = len(subsets)
                d_matrix = np.zeros((dim * dim - dim) // 2)
                d_matrix[:] = np.inf

                if step == 1:
                    # Now initialise a change in info score matrix to inf
                    c_matrix = np.empty(d_matrix.shape)
                    c_matrix[:] = np.inf
                    c_matrix = spatial.distance.squareform(c_matrix)

                # 1. pick top N subset pairs from distance matrix
                cutoff = max_schemes  # this defines the greedy algorithm: we look at all schemes

                closest_pairs = neighbour.get_N_closest_subsets(
                    subsets, the_config, cutoff, d_matrix)

                # 2. analyse subsets in top N that have not yet been analysed
                pairs_todo = neighbour.get_pairs_todo(closest_pairs, c_matrix,
                                                      subsets)
                if len(pairs_todo) > 0:
                    log.info("Analysing %d new subset pairs" % len(pairs_todo))
                    new_subs = []
                    sub_tuples = []
                    for pair in pairs_todo:
                        new_sub = subset_ops.merge_subsets(pair)
                        new_subs.append(new_sub)
                        sub_tuples.append((new_sub, pair))

                    the_config.progress.begin(scheme_count, len(new_subs))
                    self.analyse_list_of_subsets(new_subs)

                    # 3. for all K new subsets, update improvement matrix and find best pair
                    log.info("Finding the best partitioning scheme")
                    diffs = []
                    scheme_name = "step_%d" % (step)
                    for t in sub_tuples:
                        pair_merged = t[0]
                        pair = t[1]
                        new_scheme = neighbour.make_clustered_scheme(
                            start_scheme, scheme_name, pair, pair_merged,
                            the_config)
                        r = self.analyse_scheme(new_scheme)
                        diff = r.score - start_score
                        diffs.append(diff)

                    c_matrix = neighbour.update_c_matrix(
                        c_matrix, sub_tuples, subsets, diffs)

                # 4. Find the best pair of subsets, and build a scheme based on that
                # note that this matrix includes diagonals, which will all be zero
                # since this is equivalent to comparing a scheme to itself.
                # so we need to be careful to only proceed if we have a negative change
                # which indicates an improvement in the score
                best_change = np.amin(c_matrix)

                log.debug("Biggest improvement in info score: %s",
                          str(best_change))

                if best_change >= 0:
                    log.info(
                        "Found no schemes that improve the score, stopping")
                    break

                best_pair = neighbour.get_best_pair(c_matrix, best_change,
                                                    subsets)

                best_merged = subset_ops.merge_subsets(best_pair)
                best_scheme = neighbour.make_clustered_scheme(
                    start_scheme, scheme_name, best_pair, best_merged,
                    the_config)
                best_result = self.analyse_scheme(best_scheme)

                # the best change can shift by a fraction at this point,
                # because calculating the info score on the whole alignment
                # is a little different from doing it on the one subset
                best_change = self.results.best_score - start_score

                log.info("Best scheme combines subsets: '%s' and '%s'" %
                         (best_pair[0].name, best_pair[1].name))

                log.info(
                    "The best scheme improves the %s score by %.2f to %.1f",
                    the_config.model_selection, np.abs(best_change),
                    self.results.best_score)
                start_scheme = best_scheme
                start_score = best_result.score

                log.debug("Best pair: %s", str([s.name for s in best_pair]))
                log.debug("Merged into: %s", str([best_merged.name]))

                # 5. reset_c_matrix and the subset list
                c_matrix = neighbour.reset_c_matrix(c_matrix, list(best_pair),
                                                    [best_merged], subsets)

                # we update the subset list in a way that keeps its structure in step with the c-matrix
                subsets = neighbour.reset_subsets(subsets, list(best_pair),
                                                  [best_merged])

                if not the_config.quick:
                    the_config.reporter.write_scheme_summary(
                        best_scheme, best_result)

                step += 1

        log.info("Greedy algorithm finished after %d steps" % step)
        log.info("Best scoring scheme is scheme %s, with %s score of %.3f" %
                 (self.results.best_scheme.name, the_config.model_selection,
                  self.results.best_score))

        the_config.reporter.write_best_scheme(self.results)
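
The fake distance matrix above needs exactly one entry per subset pair, which is what comb(n, 2) counts; the condensed length (n*n - n) // 2 is the same number. A minimal sketch of that sizing (comb is imported from scipy.special here, which may differ from the import this code uses):

import numpy as np
from scipy.special import comb
from scipy.spatial.distance import squareform

n = 5
n_pairs = int(comb(n, 2))            # 10 unordered pairs
assert n_pairs == (n * n - n) // 2   # same count as the condensed-matrix size

d_matrix = np.full(n_pairs, np.inf)  # condensed form, one entry per pair
print(squareform(d_matrix).shape)    # (5, 5)
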
Example #10
    def setup(self):

        if the_config.datatype != 'morphology':
            log.warning("METHOD DISCONTINUED: \
                There is increasing evidence that the kmeans \
                algorithm can lead to poor inferences, so we have \
                discontinued its use for most data types. \
                You should instead use other approaches \
                (e.g. partitioning by locus and codon position). If you have any \
                questions, please get in touch on the google group. More \
                information on the empirical issues \
                can be found in this paper: \
                http://www.sciencedirect.com/science/article/pii/S1055790316302780."
                )
            raise AnalysisError
        else:
            log.warning("USE CAUTION: \
                There is increasing evidence that the kmeans \
                algorithm can lead to poor inferences, so we have \
                discontinued its use for most data types \
                (i.e. amino acid and nucleotide data). \
                More information on the empirical issues \
                can be found in this paper: \
                http://www.sciencedirect.com/science/article/pii/S1055790316302780. \
                We have kept the method available for morphological \
                data, but warn users that the method is: experimental, \
                untested on morphological data (either empirical or \
                simulated), and may give incorrect topologies and branch \
                lengths (see link to paper above)." 
                )


        # set the default minimum subset size to 100 for kmeans analyses
        if the_config.min_subset_size == False:
            the_config.min_subset_size = 100

        partnum = len(the_config.user_subsets)
        the_config.progress.begin(1, 1)

        # Start with the most partitioned scheme
        start_description = range(partnum)
        start_scheme = scheme.create_scheme(
            the_config, "start_scheme", start_description)

        site_max = sum([ len(s.columns) for s in start_scheme.subsets])

        if the_config.min_subset_size > site_max:
            log.error("The minimum subset size must be smaller than the \
                total number of sites you want to analyse. Your minimum \
                subset size is %d, and your alignment is %d sites. Please \
                check and try again." %(the_config.min_subset_size, site_max)
                )
            raise AnalysisError


        with logtools.indented(log, "**Analysing starting scheme (scheme %s)**" % start_scheme.name):
            start_result = self.analyse_scheme(start_scheme)

            if not the_config.quick:
                the_config.reporter.write_scheme_summary(start_scheme, start_result)

            tree_path = the_config.processor.make_tree_path(
                self.filtered_alignment_path)

        if the_config.kmeans == 'tiger' and the_config.datatype != 'morphology':
            log.error("You have selected kmeans and tiger \
                rates. This is an unsupported option for anything except \
                morphological data. The kmeans algorithm \
                now works with entropies, not TIGER rates.")
            raise AnalysisError

        return start_result, start_scheme, tree_path
    def setup(self):

        # set the default minimum subset size to 100 for kmeans analyses
        if the_config.min_subset_size == False:
            the_config.min_subset_size = 100

        partnum = len(the_config.user_subsets)
        the_config.progress.begin(1, 1)

        # Start with the most partitioned scheme
        start_description = range(partnum)
        start_scheme = scheme.create_scheme(the_config, "start_scheme",
                                            start_description)

        if len(start_scheme.subsets) > 1:
            log.error("The k-means algorithm is designed to analyse \
                the entire alignment at once. To use it, please define a \
                single data block that includes all of your sites, and try \
                again.")
            raise AnalysisError

        site_max = sum([len(s.columns) for s in start_scheme.subsets])

        if the_config.min_subset_size > site_max:
            log.error("The minimum subset size must be smaller than the \
                total number of sites you want to analyse. Your minimum \
                subset size is %d, and your alignment is %d sites. Please \
                check and try again." % (the_config.min_subset_size, site_max))
            raise AnalysisError

        with logtools.indented(
                log, "**Analysing starting scheme (scheme %s)**" %
                start_scheme.name):
            start_result = self.analyse_scheme(start_scheme)

            if not the_config.quick:
                the_config.reporter.write_scheme_summary(
                    start_scheme, start_result)

            tree_path = the_config.processor.make_tree_path(
                self.filtered_alignment_path)

        if the_config.kmeans == 'tiger':
            try:
                from _tiger import TigerDNA
                the_config.TigerDNA = TigerDNA
            except ImportError:
                log.error("Couldn't find compiled tiger code.")
                log.error("You have selected kmeans and tiger \
                    rates. This is an unsupported option, if you still wish to use \
                    this option, you must compile the tiger code.")
                log.error(
                    "Once you compile the tiger code, this option will work. \
                    But please note that this is an \
                    unsupported option. For empirical work we recommend using \
                    entropy calculations for site rates, which is the default \
                    behaviour for the kmeans algorithm in PF2.")
                raise AnalysisError
        else:
            the_config.TigerDNA = None

        return start_result, start_scheme, tree_path
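
A minimal sketch of the single-data-block guard in this setup(), using plain lists of site indices in place of the real subset objects.

def check_single_block(subsets):
    # The k-means search expects the whole alignment in one data block.
    if len(subsets) > 1:
        raise ValueError(
            "The k-means algorithm analyses the entire alignment at once; "
            "please define a single data block that includes all of your "
            "sites, and try again.")
    return sum(len(s) for s in subsets)

print(check_single_block([list(range(500))]))  # 500
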