Exemple #1
0
    def __init__(self,
                 nclusters,
                 collection,
                 tmpdir='/tmp',
                 initial_assignment=None):
        """
        Prepare the optimiser for `collection`.

        nclusters          -- number of clusters asked for; also used to
                              draw a random starting partition
        collection         -- object exposing `records`, `datatype` and
                              `calc_NJ_trees()`
        tmpdir             -- directory handed to the Scorer and kept on
                              the instance
        initial_assignment -- starting partition; when None one is drawn
                              with self.random_partition(nclusters)
        """
        self.Collection = collection
        self.nclusters = nclusters
        self.tmpdir = tmpdir

        # Trees are only built when the first record does not have one.
        first_record = collection.records[0]
        if not first_record.tree:
            print('Calculating NJ trees for collection...')
            collection.calc_NJ_trees()

        self.datatype = collection.datatype
        self.Scorer = Scorer(collection.records,
                             analysis='nj',
                             datatype=self.datatype,
                             tmpdir=tmpdir)

        start = initial_assignment
        if start is None:
            start = self.random_partition(nclusters)

        print('Calculating initial scores...')
        self.global_best_score = self.Scorer.score(start)
        self.global_best_assignment = start

        # Bookkeeping counters used by the optimisation loop.
        for counter in ('done_worse', 'stayed_put', 'i', 'resets', 'merges'):
            setattr(self, counter, 0)
Exemple #2
0
    def __init__(self, nclusters, collection, tmpdir='/tmp',
                 initial_assignment=None):
        """
        Build the scorer and record the score of the starting partition.

        When `initial_assignment` is None a random partition is drawn via
        self.random_partition(nclusters). NJ trees are calculated first if
        the collection's first record has no tree.
        """
        self.Collection = collection
        self.nclusters = nclusters
        self.tmpdir = tmpdir

        needs_trees = not collection.records[0].tree
        if needs_trees:
            print('Calculating NJ trees for collection...')
            collection.calc_NJ_trees()

        self.datatype = collection.datatype
        scorer_options = dict(analysis='nj', datatype=self.datatype,
                              tmpdir=tmpdir)
        self.Scorer = Scorer(collection.records, **scorer_options)

        if initial_assignment is None:
            initial_assignment = self.random_partition(nclusters)

        print('Calculating initial scores...')
        self.global_best_score = self.Scorer.score(initial_assignment)
        self.global_best_assignment = initial_assignment

        # All progress counters start from zero.
        self.done_worse = self.stayed_put = self.i = 0
        self.resets = self.merges = 0
Exemple #3
0
class Optimiser(object):
    """
    Iteratively reassigns records of a collection between clusters, keeping
    the best-scoring partition found (scores come from a Scorer object).

    Output uses single-argument print(...) calls, which behave the same as
    the print statement on Python 2 and also parse on Python 3.
    """

    def __init__(self, nclusters, collection, tmpdir='/tmp',
                 initial_assignment=None):
        """
        nclusters          -- number of clusters; used to draw the random
                              starting partition
        collection         -- object exposing `records`, `datatype` and
                              `calc_NJ_trees()`
        tmpdir             -- working directory for temporary files
        initial_assignment -- starting partition (random when None)
        """
        self.Collection = collection

        if not self.Collection.records[0].tree:
            print('Calculating NJ trees for collection...')
            self.Collection.calc_NJ_trees()

        self.datatype = collection.datatype
        self.Scorer = Scorer(self.Collection.records, analysis='nj',
                             datatype=self.datatype,
                             tmpdir=tmpdir)

        if initial_assignment is None:
            initial_assignment = self.random_partition(nclusters)

        self.nclusters = nclusters
        self.tmpdir = tmpdir
        print('Calculating initial scores...')
        self.global_best_score = self.Scorer.score(initial_assignment)
        self.global_best_assignment = initial_assignment
        self.done_worse = 0
        self.stayed_put = 0
        self.i = 0
        self.resets = 0
        self.merges = 0
        # move() appends to this list, so it has to exist even when move()
        # is reached without optimise() having run (e.g. final_assignment).
        self.sampled = []

    def _reset_counts(self):
        """
        Zero the per-run counters (self.merges is deliberately kept).
        """
        self.done_worse = 0
        self.stayed_put = 0
        self.i = 0
        self.resets = 0

    def _status(self):
        """
        Return 'iteration best_score best_assignment' as one string.
        """
        return '{0} {1} {2}'.format(self.i, self.global_best_score,
                                    self.global_best_assignment)

    def random_partition(self, nclusters):
        """
        Return a Partition with one random label per record, drawn from
        np.random.randint(nclusters), i.e. raw labels 0..nclusters-1.
        NOTE(review): other methods treat labels as 1-based; presumably
        Partition relabels its input -- confirm.
        """
        return Partition(tuple(np.random.randint(nclusters,
                         size=len(self.Collection))))

    def update(self, assignment):
        """
        method for working interactively and keeping nclusters correct

        Unconditionally stores `assignment` and its score as the global
        best and sets nclusters to the largest label in the partition.
        """
        self.global_best_assignment = assignment
        self.global_best_score = self.Scorer.score(assignment)
        self.nclusters = max(assignment.partition_vector)

    def get_clusters(self, assignment):
        """
        Return a defaultdict mapping cluster label -> list of the record
        positions carrying that label.
        """
        pvec = assignment.partition_vector
        index_dict = defaultdict(list)
        for (position, value) in enumerate(pvec):
            index_dict[value].append(position)
        return index_dict

    def get_cluster_trees(self, assignment, index_dict=None):
        """
        Return a dict mapping cluster label -> entry of self.Scorer.concats
        for that cluster's members, adding missing entries to the Scorer.
        """
        index_dict = (index_dict or self.get_clusters(assignment))
        tree_dict = {}
        for (k, v) in index_dict.items():
            members = tuple(v)
            if members not in self.Scorer.concats:
                self.Scorer.add(members)
            tree_dict[k] = self.Scorer.concats[members]
        return tree_dict

    def score_sample(self, sample, assignment):
        """
        Score a pre-made sample of records against every cluster tree.

        sample     -- iterable of record indices
        assignment -- Partition object

        Returns an m*n array where m is the number of records in the
        sample and n the number of clusters in the assignment; the value
        for cluster label j is stored in column j-1, so labels are assumed
        to run from 1 to n.
        """
        cluster_trees = self.get_cluster_trees(assignment)
        scores = np.zeros((len(sample), len(cluster_trees)))
        for i, record_index in enumerate(sample):
            rec = self.Collection.records[record_index]
            for j, tree in cluster_trees.items():
                scores[i, j-1] = self.test(rec, tree)
        return scores

    @staticmethod
    def _select_reassignments(scores, nreassign, choose):
        """
        Return the row indices of `scores` that should be reassigned.

        Rows are ranked on their scores normalised by the row sum: 'max'
        takes the `nreassign` rows with the largest row-maximum, 'min' the
        `nreassign` rows with the smallest row-minimum.

        Raises ValueError for any other value of `choose`.
        """
        if choose not in ('max', 'min'):
            raise ValueError(
                "choose must be 'max' or 'min', got {0!r}".format(choose))

        nreassign = max(nreassign, 0)
        M = scores/scores.sum(axis=1)[:, np.newaxis]
        if choose == 'max':
            order = M.max(axis=1).argsort()
            # An explicit start index is needed: order[-0:] would select
            # every row instead of none.
            return order[len(order) - min(nreassign, len(order)):]
        return M.min(axis=1).argsort()[:nreassign]

    def make_new_assignment(self, sample, scores, assignment, nreassign=1, choose='max'):
        """
        MAKES A NEW PARTITION BY REASSIGNING RECORDS BETWEEN CLUSTERS

        sample     -- sequence of record indices, in the same order as the
                      rows of `scores`
        scores     -- array as returned by score_sample
        assignment -- Partition to start from
        nreassign  -- number of sampled records to reassign
        choose     -- 'max' or 'min' (see _select_reassignments)

        Raises ValueError when `choose` is not 'max' or 'min'.
        """
        reassignments = self._select_reassignments(scores, nreassign, choose)
        new_clusters = scores.argmax(axis=1)
        new_assignment = list(assignment.partition_vector)

        for i in reassignments:
            # cluster number is in range [1,x], new_clusters in [0,x-1]
            new_assignment[sample[i]] = new_clusters[i]+1
        return Partition(tuple(new_assignment))

    def move(self, sample_size, assignment, nreassign=1, choose='max', sampled=None):
        """
        Draw a sample of records, score it and reassign records.
        Wraps self.score_sample + self.make_new_assignment.

        sample_size -- number of records to draw
        sampled     -- record indices excluded from the draw (none when
                       None)

        The records drawn are appended to self.sampled. Returns the new
        Partition, or `assignment` unchanged when every record is excluded.
        """
        excluded = set(sampled) if sampled is not None else set()
        # A sorted list rather than a set: make_new_assignment indexes the
        # sample, and random.sample no longer accepts sets on Python 3.11+.
        unsampled = sorted(set(range(len(self.Collection))) - excluded)

        if unsampled:
            if sample_size > len(unsampled):
                sample = unsampled
            else:
                sample = random.sample(unsampled, sample_size)

            self.sampled.extend(sample)
            scores = self.score_sample(sample, assignment)
            assignment = self.make_new_assignment(sample, scores, assignment,
                                                  nreassign, choose)
        return assignment

    def merge(self, assignment, label1, label2):
        """
        Return a Partition in which every `label1` is replaced by `label2`.
        """
        pvec = ((x if x != label1 else label2)
                for x in assignment.partition_vector)
        return Partition(tuple(pvec))

    def merge_closest(self, assignment):
        """
        Try merging every pair of clusters and return the best-scoring
        merged Partition.

        Raises ValueError when the assignment has fewer than two clusters.
        """
        print('Finding clusters to merge...')
        clusters = self.get_clusters(assignment)
        if len(clusters) < 2:
            raise ValueError(
                'Cannot merge: assignment has fewer than two clusters')

        best_score = -np.inf
        best_assignment = None

        for i in clusters:
            for j in clusters:
                # (i, j) and (j, i) give the same grouping: test each
                # unordered pair once.
                if i >= j:
                    continue
                print('Testing Clusters {0} and {1}'.format(i, j))
                test_assignment = self.merge(assignment, i, j)
                score = self.Scorer.score(test_assignment)

                if best_assignment is None or score > best_score:
                    best_score = score
                    best_assignment = test_assignment

        print('Best assignment: {0}'.format(best_assignment))
        return best_assignment

    def split(self, k, assignment):
        """
        Function to split cluster based on least representative alignment

        The member of cluster `k` with the lowest self.test score against
        the cluster tree seeds a new cluster; all members of `k` are then
        reassigned via make_new_assignment. A single-member cluster cannot
        be split and `assignment` is returned unchanged.
        """
        print(assignment)
        members = self.get_clusters(assignment)[k]
        if len(members) == 1:
            return assignment

        tree = self.get_cluster_trees(assignment)[k]
        alignment_scores = {}
        print('Calculating alignment scores...')

        for i in members:
            r = self.Collection.records[i]
            alignment_scores[i] = self.test(r, tree)

        seed, min_score = min(alignment_scores.items(), key=operator.itemgetter(1))
        print('Splitting on {0}.'.format(seed))

        new_assignment = list(assignment.partition_vector)
        new_assignment[seed] = max(assignment.partition_vector) + 1
        print('New Partition: {0}'.format(new_assignment))
        print('Assigning to new partition...')

        new_assignment = Partition(tuple(new_assignment))
        scores = self.score_sample(members, new_assignment)
        assignment = self.make_new_assignment(members, scores, new_assignment, nreassign=len(members))
        print('Returning: {0}'.format(assignment))

        return assignment

    def split_max_var(self, assignment):
        """
        Split the cluster with the largest self.var value and return the
        resulting Partition.

        The cluster is chosen from the computed values in var_dict (the
        member lists used to be compared instead, and nothing was
        returned).
        """
        clusters = self.get_clusters(assignment)
        var_dict = {}

        for k in clusters.keys():
            var_dict[k] = self.var(clusters[k])

        print(var_dict)

        cluster_to_split, var = max(var_dict.items(), key=operator.itemgetter(1))
        return self.split(cluster_to_split, assignment)

    def split_search(self, assignment):
        """
        Try splitting every cluster and return the best-scoring result.

        A candidate only counts when its largest label is one more than
        that of `assignment`. Raises ValueError when no cluster yields such
        a candidate.
        """
        clusters = self.get_clusters(assignment)
        k = max(assignment.partition_vector)
        best_score = -np.inf
        best_assignment = None

        for i in clusters:
            print('i: {0}'.format(i))
            test_assignment = self.split(i, assignment)
            if max(test_assignment.partition_vector) == k + 1:
                score = self.Scorer.score(test_assignment)
            else:
                score = -np.inf
                print('Something has gone wrong')
            print(test_assignment)
            print(score)

            if score > best_score:
                best_score = score
                best_assignment = test_assignment

        if best_assignment is None:
            raise ValueError('No cluster could be split into two')
        return best_assignment

    def test(self, record, tree, model='WAG'):
        """
        TESTS AN ALIGNMENT AGAINST A TREE TOPOLOGY

        Writes the record and the tree to fixed file names inside
        self.tmpdir, runs Phyml on them and returns the score of the run.
        NOTE(review): the datatype flag is always 'aa', whatever
        self.datatype is -- confirm this is intended.
        """
        alignment_file = record.write_phylip('{0}/tmp_alignment.phy'.format(
            self.tmpdir), interleaved=True)
        newick_file = tree.write_to_file('{0}/tmp_tree.nwk'.format(self.tmpdir))
        p = Phyml(record)
        p.add_tempfile(alignment_file)
        p.add_tempfile(newick_file)
        p.add_flag('-i', alignment_file)
        p.add_flag('-u', newick_file)
        p.add_flag('-b', '0')    # no bootstraps
        p.add_flag('-m', model)  # evolutionary model
        p.add_flag('-o', 'n')    # no optimisation
        p.add_flag('-d', 'aa')   # datatype
        return p.run().score

    def var(self, members):
        """
        Return the Scorer score for `members` divided by the summed
        seqlength of the corresponding records.
        """
        score = self.Scorer.add(tuple(members)).score
        records = [self.Collection.records[i] for i in members]
        total_length = sum(r.seqlength for r in records)

        return score / total_length

    def optimise(self,
                 assignment,
                 update=True,
                 history=True,
                 sample_size=10,
                 nreassign=10,
                 max_stayed_put=5,
                 max_resets=5,
                 max_done_worse=5,
                 max_iter=1000):
        """
        Repeatedly call move() and keep the best-scoring partition seen.

        assignment     -- starting Partition
        update         -- when true, store the result with self.update
        history        -- passed through to self.Scorer.score
        sample_size    -- records drawn per move
        nreassign      -- records reassigned per move
        max_stayed_put -- stop once the score was unchanged more than
                          this many times in a row
        max_resets     -- stop after this many resets
        max_done_worse -- go back to the best partition after this many
                          consecutive worse scores
        max_iter       -- iteration cap

        Returns the best Partition found; counters are reset on exit.
        """
        local_best_assignment = assignment
        local_best_score = self.Scorer.score(local_best_assignment, history=history)
        current_assignment = local_best_assignment
        self.sampled = []

        print('Optimising: {0} {1} {2}'.format(self.i, local_best_score, current_assignment))

        while True:
            if self.stayed_put > max_stayed_put:
                print('stayed put too many times ({0})'.format(max_stayed_put))
                break
            if self.resets == max_resets:
                print('Reset limit reached ({0})'.format(max_resets))
                break
            if self.done_worse == max_done_worse:
                print('wandered off, resetting...')
                self.resets += 1
                self.done_worse = 0
                current_assignment = local_best_assignment
            if self.i == max_iter:
                print('max iterations reached')
                break

            # Pass the records already tried since the last change of
            # score, so a stuck search samples different records; without
            # this self.sampled was filled but never consulted.
            # NOTE(review): current_assignment only changes on a reset --
            # confirm that moves are meant to restart from it every time.
            new_assignment = self.move(sample_size, current_assignment,
                                       nreassign, sampled=self.sampled)
            score = self.Scorer.score(new_assignment, history=history)
            print('{0} {1}'.format(score, new_assignment))

            if score > local_best_score:
                self.sampled = []
                local_best_score = score
                local_best_assignment = new_assignment
                self.stayed_put = 0
                self.done_worse = 0
                self.resets = 0
            elif np.abs(score - local_best_score) < EPS:
                self.stayed_put += 1
                self.done_worse = 0
            else:
                self.sampled = []
                self.stayed_put = 0
                self.done_worse += 1
            print(self._status())
            self.i += 1

        if update:
            self.update(local_best_assignment)

        print(self._status())
        self._reset_counts()
        return local_best_assignment

    def optimise_with_merge(self, assignment, update=True, **kwargs):
        """
        Optimise, then split the best cluster, re-optimise briefly, merge
        the closest clusters, and repeat while the merged score differs
        from the optimised score by more than EPS.

        update -- forwarded to every optimise() call and applied to the
                  final result
        kwargs -- forwarded to optimise(); `history` and `max_iter` are
                  overridden for the short post-split optimisation

        Returns the final merged Partition.
        """
        new_assignment = self.optimise(assignment, update=update, **kwargs)
        print('Partition after {0} merges at {1}:\n{2}'.format(
            self.merges, sum(os.times()[:4]), new_assignment))
        opt_score = self.Scorer.score(new_assignment)
        print('Score: {0}'.format(opt_score))
        self.merges += 1

        split = self.split_search(new_assignment)
        # Merge the overrides into kwargs so a caller-supplied `history`
        # or `max_iter` cannot trigger a duplicate-keyword TypeError.
        split_kwargs = dict(kwargs, history=False, max_iter=10)
        split = self.optimise(split, update=update, **split_kwargs)
        print('Partition after {0} splits at {1}:\n{2}'.format(
            self.merges, sum(os.times()[:4]), split))
        print('Score: {0}'.format(self.Scorer.score(split)))

        merged = self.merge_closest(split)
        merged_score = self.Scorer.score(merged)
        print('Partition after {0} merges at {1}:\n{2}'.format(
            self.merges, sum(os.times()[:4]), merged))
        print('Score: {0}'.format(merged_score))

        if np.abs(merged_score - opt_score) > EPS:
            # The recursive result used to be dropped, returning None.
            return self.optimise_with_merge(merged, update=update, **kwargs)

        if update:
            self.update(merged)
        return merged

    def final_assignment(self, assignment):
        """
        Make one move of size len(assignment) and store the result as the
        global best when it scores higher than the current global best.
        NOTE(review): confirm whether len(assignment) is meant to be the
        number of records or the number of clusters.
        """
        n = len(assignment)
        new_assignment = self.move(n, assignment, n)
        score = self.Scorer.score(new_assignment)
        if score > self.global_best_score:
            self.global_best_score = score
            self.global_best_assignment = new_assignment
Exemple #4
0
class Optimiser(object):

    def __init__(self, nclusters, collection, tmpdir='/tmp',
                 initial_assignment=None, scorer=None):
        self.Collection = collection

        if not self.Collection.records[0].tree:
            print 'Calculating NJ trees for collection...'
            self.Collection.calc_NJ_trees()

        self.datatype = collection.datatype
        if scorer is not None and isinstance(scorer, Scorer):
            self.scorer = scorer
        else:
            self.scorer = Scorer(self.Collection.records, analysis='nj',
                             datatype=self.datatype,
                             tmpdir=tmpdir)

        self.nclusters = nclusters
        self.tmpdir = tmpdir

        print 'Calculating initial scores...'
        if initial_assignment is None:
            initial_assignment = self.random_partition(nclusters)
        
        self.global_best_scores = {}
        self.global_best_assignments = {}
        self.global_best_scores[self.nclusters] = self.scorer.score(
            initial_assignment)
        self.global_best_assignments[self.nclusters] = initial_assignment
        
        self.done_worse = 0
        self.stayed_put = 0
        self.i = 0
        self.resets = 0
        self.merges = 0

    def _reset_counts(self):
        self.done_worse = 0
        self.stayed_put = 0
        self.i = 0
        self.resets = 0

    def _status(self, current_assignment):
        iter_ = self.i
        n = len(current_assignment)
        curr_score = self.scorer.score(current_assignment)
        best_score = self.global_best_scores[n]

        return 'Iter:{0} Nclusters:{1} Current score:{2} Best score:{3}'.format(
            iter_, n, curr_score, best_score)

    def random_partition(self, nclusters):
        return Partition(tuple(np.random.randint(nclusters,
                         size=len(self.Collection))))

    def update(self, assignment):
        """
        method for working interactively and keeping nclusters correct
        """
        nclusters = len(assignment) # len(assignment) == number of clusters
        best_score = self.global_best_scores.get(nclusters, MINUS_INF)
        curr_score = self.scorer.score(assignment)
        if (curr_score - best_score) > EPS:
            self.global_best_assignments[nclusters] = assignment
            self.global_best_scores[nclusters] = self.scorer.score(assignment)

    def get_clusters(self, assignment):
        pvec = assignment.partition_vector
        index_dict = defaultdict(list)
        for (position, value) in enumerate(pvec):
            index_dict[value].append(position)
        return index_dict

    def get_cluster_trees(self, assignment, index_dict=None):
        index_dict = (index_dict or self.get_clusters(assignment))
        tree_dict = {}
        for (k, v) in index_dict.items():
            if not tuple(v) in self.scorer.concats:
                self.scorer.add(tuple(v))
            tree_dict[k] = self.scorer.concats[tuple(v)]
        return tree_dict

    def score_sample(self, sample, assignment):
        """
        !! changed to simply SCORE a PRE-MADE SAMPLE
        sample_size:int, assignment:Partition object
        Calculates score m*n score matrix, where m is number of alignments
        in the sample, and n in the number of clusters encoded in the
        assignment (==Partition object)
        """
        # sample = random.sample(range(len(self.Collection)), sample_size)
        cluster_trees = self.get_cluster_trees(assignment)
        scores = np.zeros((len(sample), len(cluster_trees)))
        for i, record_index in enumerate(sample):
            rec = self.Collection.records[record_index]
            for j, tree in cluster_trees.items():
                scores[i, j-1] = self.test(rec, tree)
        return (scores)

    def constrain_assignment(self, assignment, nclusters=None):
        """
        Constrain the assignment to have self.nclusters clusters
        """

        if nclusters is None:
            nclusters = self.nclusters
        if (nclusters < 1) or (nclusters > len(self.Collection)):
            raise ValueError('Invalid number of clusters: {}'.format(nclusters))
        while len(assignment.get_membership()) > nclusters:
            assignment = self.merge_closest(assignment)
        while len(assignment.get_membership()) < nclusters:
            assignment = self.split_search(assignment)
        return assignment

    def make_new_assignment(self, sample, scores, assignment, nreassign=1, 
            choose='max'):
        """
        MAKES A NEW PARTITION BY REASSIGNING RECORDS BETWEEN CLUSTERS
        """

        new_clusters = scores.argmax(axis=1)
        M = scores/scores.sum(axis=1)[:, np.newaxis]
        if choose == 'max':
            reassignments = M.max(axis=1).argsort()[-nreassign:]
        elif choose == 'min':
            reassignments = M.min(axis=1).argsort()[:nreassign]

        new_assignment = list(assignment.partition_vector)

        for i in reassignments:
            new_assignment[sample[i]] = new_clusters[i]+1  
                # because cluster number is in range
                # [1,x], and new_clusters is in range [0,x-1]
        
        return Partition(tuple(new_assignment))

    def move(self, sample_size, assignment, nreassign=1, choose='max', 
            sampled=None):
        """
        !! now generates own sample and passes to scores
        wraps self.score_sample + self.new_assignment
        """

        if sampled is None:
            sampled = list()

        unsampled = set(range(len(self.Collection))) - set(sampled)

        if len(unsampled) > 0:
            if sample_size > len(unsampled):
                sample = unsampled
            else:
                sample = random.sample(unsampled, sample_size)

            self.sampled.extend(sample)
            scores = self.score_sample(sample, assignment)
            assignment = self.make_new_assignment(sample, scores, assignment,
                                                  nreassign, choose)
        return assignment

    def merge(self, assignment, label1, label2):
        pvec = ((x if x != label1 else label2)
                for x in assignment.partition_vector)
        return Partition(tuple(pvec))

    def merge_closest(self, assignment):
        print 'Finding clusters to merge...'
        clusters = self.get_clusters(assignment)
        best_score = MINUS_INF
        merging = [None, None]

        for i in clusters:
            for j in clusters:
                # print 'i = {}, j = {}'.format(i, j)
                if i >= j:
                    continue
                print 'Testing Clusters {0} and {1}'.format(i, j)
                test_assignment = self.merge(assignment, i, j)
                self.update(test_assignment)
                score = self.scorer.score(test_assignment)

                if score > best_score:
                    merging[0] = i
                    merging[1] = j
                    best_score = score
                    best_assignment = test_assignment

        print 'Merging clusters {0} and {1}'.format(*merging)
        print 'Best assignment: {0}'.format(best_assignment)
        return(best_assignment)

    def split(self, k, assignment, verbosity=1):
        """
        Function to split cluster based on least representative alignment
        """
        if verbosity > 1:
            print assignment
        members = self.get_clusters(assignment)[k]
        if len(members) == 1:
            return assignment
        elif len(members) == 2:
            new_partition_vector = list(assignment.partition_vector)
            new_partition_vector[members[0]] = max(assignment.partition_vector) + 1
            new_assignment = Partition(new_partition_vector)
            return new_assignment

        tree = self.get_cluster_trees(assignment)[k]
        alignment_scores = {}
        if verbosity > 0:
            print 'Calculating alignment scores...'

        for i in members:
            r = self.Collection.records[i]
            alignment_scores[i] = self.test(r, tree) / float(r.seqlength) 
                # per-site likelihood

        seed, min_score = min(alignment_scores.iteritems(), 
            key=operator.itemgetter(1))
        print 'Splitting on {0}.'.format(seed+1) # convert to 1-based indexing

        new_assignment = list(assignment.partition_vector)
        new_assignment[seed] = max(assignment.partition_vector) + 1
        if verbosity > 1:
            print 'New Partition: {0}'.format(new_assignment)
        if verbosity > 0:
            print 'Assigning to new partition...'

        new_assignment = Partition(new_assignment)
        scores = self.score_sample(members, new_assignment)
        assignment = self.make_new_assignment(members, scores, new_assignment,
            nreassign=len(members))
        if verbosity > 1:
            print 'Returning: {0}'.format(assignment)

        return assignment

    def split_max_var(self, assignment):
        clusters = self.get_clusters(assignment)
        var_dict = {}

        for k in clusters.keys():
            var_dict[k] = self.var(clusters[k])

        print var_dict

        cluster_to_split, var = max(clusters.iteritems(), 
            key=operator.itemgetter(1))

    def split_search(self, assignment, update=True):
        clusters = self.get_clusters(assignment)
        k = len(assignment)
        best_score = -np.Inf

        for i in clusters:
            print 'i: {0}'.format(i)
            test_assignment = self.split(i, assignment)
            # score = self.scorer.score(test_assignment)
            if len(test_assignment) == k + 1:
                score = self.scorer.score(test_assignment)
                self.update(test_assignment)
            else:
                score = -np.Inf
                print 'Something has gone wrong'
            print test_assignment
            print score

            if score > best_score:
                best_score = score
                best_assignment = test_assignment

        return best_assignment

    def test(self, record, tree, model=None):
        """
        TESTS AN ALIGNMENT AGAINST A TREE TOPOLOGY
        """
        tmp_record = copy.deepcopy(record)
        
        # if tree label set and record label set don't match
        header_set = set(tmp_record.headers)
        extra_in_tree = tree.labels - header_set
        extra_in_record = header_set - tree.labels
        
        if extra_in_tree:
            for lab in extra_in_tree:
                tmp_record.headers.append(lab)
                tmp_record.sequences.append(''.join(['-']*tmp_record.seqlength))
            tmp_record._update()

        if extra_in_record:
            for lab in extra_in_record:
                i = tmp_record.headers.index(lab)
                tmp_record.headers = tmp_record.headers[:i] + tmp_record.headers[i+1:]
                tmp_record.sequences = tmp_record.sequences[:i] + tmp_record.sequences[i+1:]
            tmp_record._update()

        alignment_file = tmp_record.write_phylip('{0}/tmp_alignment.phy'.format(
            self.tmpdir), interleaved=True)
        newick_file = tree.write_to_file('{0}/tmp_tree.nwk'.format(self.tmpdir))
        p = Phyml(tmp_record, self.tmpdir)
        p.add_tempfile(alignment_file)
        p.add_tempfile(newick_file)
        p.add_flag('-i', alignment_file)
        p.add_flag('-u', newick_file)
        p.add_flag('-b', '0')    # no bootstraps
        if tmp_record.datatype == 'dna':
            if model is None:
                model = 'GTR'
            p.add_flag('-m', model)
            p.add_flag('-d', 'nt')
        else:
            if model is None:
                model = 'WAG'
            p.add_flag('-m', model)  # evolutionary model
            p.add_flag('-d', 'aa')   # datatype    
        
        p.add_flag('-o', 'n')    # no optimisation
        return p.run().score

    def var(self, members):
        score = self.scorer.add(tuple(members)).score
        records = [self.Collection.records[i] for i in members]
        total_length = sum([r.seqlength for r in records])

        return(score / total_length)

    def optimise(self,
                 assignment,
                 nclusters=None,
                 update=True,
                 history=True,
                 sample_size=10,
                 nreassign=10,
                 max_stayed_put=10,
                 max_resets=5,
                 max_done_worse=5,
                 max_iter=1000):


        if nclusters is None:
            nclusters = self.nclusters

        assignment = self.constrain_assignment(assignment, nclusters)

        local_best_assignment = assignment
        local_best_score = self.scorer.score(local_best_assignment, 
            history=history)
        current_assignment = local_best_assignment
        self.sampled = []

        print self._status(current_assignment)

        while True:
            if self.stayed_put > max_stayed_put:
                print 'stayed put too many times ({0})'.format(max_stayed_put)
                break
            if self.resets == max_resets:
                print 'Reset limit reached ({0})'.format(max_resets)
                break
            if self.done_worse == max_done_worse:
                print 'wandered off, resetting...'
                self.resets += 1
                self.done_worse = 0
                current_assignment = local_best_assignment
            if self.i == max_iter:
                print 'max iterations reached'
                break

            new_assignment = self.move(sample_size, current_assignment,
                                       nreassign)
            new_assignment = self.constrain_assignment(new_assignment)
            score = self.scorer.score(new_assignment, history=history)
            self.update(new_assignment)
            print self._status(new_assignment)
            # print 'this', score, new_assignment

            if (score - local_best_score) > EPS:
                self.sampled = []
                local_best_score = score
                local_best_assignment = new_assignment
                self.stayed_put = 0
                self.done_worse = 0
                self.resets = 0
            elif np.abs(score - local_best_score) < EPS:
                self.stayed_put += 1
                self.done_worse = 0
            else:
                self.sampled = []
                self.stayed_put = 0
                self.done_worse += 1
            
            self.i += 1

        self._reset_counts()
        return local_best_assignment

    def optimise_with_variable_clusters(self, 
            assignment, 
            target_clusters, 
            max_clusters, 
            optimise_on_ascent=True,
            optimise_on_descent=True, 
            update=True, 
            **kwargs):

        for n in range(target_clusters, max_clusters+1):
            print "ASCENDING (optimisation:{}) -> Current target: {} clusters".format(
                ('ON' if optimise_on_ascent else 'OFF'), n)
            if optimise_on_ascent:
                assignment = self.optimise(assignment, nclusters=n, **kwargs)
            else:
                assignment = self.constrain_assignment(assignment, n)

        for n in range(max_clusters-1, target_clusters-1, -1):
            print "DESCENDING (optimisation:{}) -> Current target: {} clusters".format(
                ('ON' if optimise_on_descent else 'OFF'), n)
            if optimise_on_descent:
                assignment = self.optimise(assignment, nclusters=n, **kwargs)
            else:
                assignment = self.constrain_assignment(assignment, n)

        return self.constrain_assignment(assignment, target_clusters)

    def optimise_with_merge(self, assignment, update=True, **kwargs):
        new_assignment = self.optimise(assignment, **kwargs)
        print 'Partition after {0} merges at {1}:\n{2}'.format(self.merges,
                                        sum(os.times()[:4]), new_assignment)
        opt_score = self.scorer.score(new_assignment)
        print 'Score: {0}'.format(opt_score)
        self.merges += 1

        split = self.split_search(new_assignment)
        split = self.optimise(split, history=False, max_iter=10, **kwargs)
        print 'Partition after {0} splits at {1}:\n{2}'.format(self.merges,
                                        sum(os.times()[:4]), new_assignment)
        print 'Score: {0}'.format(self.scorer.score(split))

        merged = self.merge_closest(split)
        merged_score = self.scorer.score(merged)
        print 'Partition after {0} merges at {1}:\n{2}'.format(self.merges,
                                        sum(os.times()[:4]), new_assignment)
        print 'Score: {0}'.format(merged_score)

        if np.abs(merged_score - opt_score) > EPS:
            merged = self.optimise_with_merge(merged, **kwargs)
        else:
            if update is True:
                self.update(merged)
            return(merged)
Exemple #5
0
class Optimiser(object):
    def __init__(self,
                 nclusters,
                 collection,
                 tmpdir='/tmp',
                 initial_assignment=None):
        self.Collection = collection

        if not self.Collection.records[0].tree:
            print 'Calculating NJ trees for collection...'
            self.Collection.calc_NJ_trees()

        self.datatype = collection.datatype
        self.Scorer = Scorer(self.Collection.records,
                             analysis='nj',
                             datatype=self.datatype,
                             tmpdir=tmpdir)

        if initial_assignment is None:
            initial_assignment = self.random_partition(nclusters)

        self.nclusters = nclusters
        self.tmpdir = tmpdir
        print 'Calculating initial scores...'
        self.global_best_score = self.Scorer.score(initial_assignment)
        self.global_best_assignment = initial_assignment
        self.done_worse = 0
        self.stayed_put = 0
        self.i = 0
        self.resets = 0
        self.merges = 0

    def _reset_counts(self):
        self.done_worse = 0
        self.stayed_put = 0
        self.i = 0
        self.resets = 0

    def _status(self):
        return '{0} {1} {2}'.format(self.i, self.global_best_score,
                                    self.global_best_assignment)

    def random_partition(self, nclusters):
        """
        Draw a random Partition with one label per record in the collection.

        Labels come from np.random.randint(nclusters, ...), i.e. integers in
        [0, nclusters).
        """
        n_records = len(self.Collection)
        labels = np.random.randint(nclusters, size=n_records)
        return Partition(tuple(labels))

    def update(self, assignment):
        """
        method for working interactively and keeping nclusters correct
        """
        self.global_best_assignment = assignment
        self.global_best_score = self.Scorer.score(assignment)
        self.nclusters = max(assignment.partition_vector)

    def get_clusters(self, assignment):
        """
        Group record positions by cluster label.

        Returns a defaultdict(list) mapping each label found in
        `assignment.partition_vector` to the positions carrying it, in
        ascending order.
        """
        members_by_label = defaultdict(list)
        for position, label in enumerate(assignment.partition_vector):
            members_by_label[label].append(position)
        return members_by_label

    def get_cluster_trees(self, assignment, index_dict=None):
        index_dict = (index_dict or self.get_clusters(assignment))
        tree_dict = {}
        for (k, v) in index_dict.items():
            if not tuple(v) in self.Scorer.concats:
                self.Scorer.add(tuple(v))
            tree_dict[k] = self.Scorer.concats[tuple(v)]
        return tree_dict

    def score_sample(self, sample, assignment):
        """
        !! changed to simply SCORE a PRE-MADE SAMPLE
        sample_size:int, assignment:Partition object
        Calculates score m*n score matrix, where m is number of alignments
        in the sample, and n in the number of clusters encoded in the
        assignment (==Partition object)
        """
        # sample = random.sample(range(len(self.Collection)), sample_size)
        cluster_trees = self.get_cluster_trees(assignment)
        scores = np.zeros((len(sample), len(cluster_trees)))
        for i, record_index in enumerate(sample):
            rec = self.Collection.records[record_index]
            for j, tree in cluster_trees.items():
                scores[i, j - 1] = self.test(rec, tree)
        return (scores)

    def make_new_assignment(self,
                            sample,
                            scores,
                            assignment,
                            nreassign=1,
                            choose='max'):
        """
        Make a new partition by reassigning sampled records between clusters.

        sample: sequence of record indices; row i of `scores` belongs to
            sample[i]
        scores: 2-D array, one row per sampled record, one column per cluster
        assignment: Partition to start from
        nreassign: maximum number of sampled records to move; 0 moves none
        choose: 'max' picks the rows with the largest row-normalised maximum,
            'min' the rows with the smallest row-normalised minimum

        Each chosen record is moved to the cluster of its best-scoring column.
        Raises ValueError for any other value of `choose`.
        """
        if choose not in ('max', 'min'):
            raise ValueError(
                "choose must be 'max' or 'min', got {0!r}".format(choose))

        new_clusters = scores.argmax(axis=1)
        M = scores / scores.sum(axis=1)[:, np.newaxis]
        nreassign = max(nreassign, 0)

        if choose == 'max':
            order = M.max(axis=1).argsort()
            # Take the last `nreassign` rows. A plain order[-nreassign:]
            # would return *every* row when nreassign == 0.
            reassignments = order[len(order) - min(nreassign, len(order)):]
        else:
            reassignments = M.min(axis=1).argsort()[:nreassign]

        new_assignment = list(assignment.partition_vector)

        for i in reassignments:
            # cluster numbers are in range [1, x] while new_clusters holds
            # column indices in range [0, x-1]
            new_assignment[sample[i]] = new_clusters[i] + 1
        return Partition(tuple(new_assignment))

    def move(self,
             sample_size,
             assignment,
             nreassign=1,
             choose='max',
             sampled=[]):
        """
        !! now generates own sample and passes to scores
        wraps self.score_sample + self.new_assignment
        """
        unsampled = set(range(len(self.Collection))) - set(sampled)

        if len(unsampled) > 0:
            if sample_size > len(unsampled):
                sample = unsampled
            else:
                sample = random.sample(unsampled, sample_size)

            self.sampled.extend(sample)
            scores = self.score_sample(sample, assignment)
            assignment = self.make_new_assignment(sample, scores, assignment,
                                                  nreassign, choose)
        return assignment

    def merge(self, assignment, label1, label2):
        """Return a new Partition with every `label1` entry relabelled `label2`."""
        relabelled = []
        for label in assignment.partition_vector:
            relabelled.append(label if label != label1 else label2)
        return Partition(tuple(relabelled))

    def merge_closest(self, assignment):
        print 'Finding clusters to merge...'
        clusters = self.get_clusters(assignment)
        best_score = -np.inf

        for i in clusters:
            for j in clusters:
                # print 'i = {}, j = {}'.format(i, j)
                if i == j:
                    continue
                print 'Testing Clusters {0} and {1}'.format(i, j)
                test_assignment = self.merge(assignment, i, j)
                score = self.Scorer.score(test_assignment)

                if score > best_score:
                    best_score = score
                    best_assignment = test_assignment

        print 'Best assignment: {0}'.format(best_assignment)
        return (best_assignment)

    def split(self, k, assignment):
        """
        Function to split cluster based on least representative alignment
        """
        print assignment
        members = self.get_clusters(assignment)[k]
        tree = self.get_cluster_trees(assignment)[k]
        alignment_scores = {}
        print 'Calculating alignment scores...'

        for i in members:
            r = self.Collection.records[i]
            alignment_scores[i] = self.test(r, tree)

        seed, min_score = min(alignment_scores.iteritems(),
                              key=operator.itemgetter(1))
        print 'Splitting on {0}.'.format(seed)

        new_assignment = list(assignment.partition_vector)
        new_assignment[seed] = max(assignment.partition_vector) + 1
        print 'New Partition: {0}'.format(new_assignment)
        print 'Assigning to new partition...'

        new_assignment = Partition(new_assignment)
        scores = self.score_sample(members, new_assignment)
        assignment = self.make_new_assignment(members,
                                              scores,
                                              new_assignment,
                                              nreassign=len(members))
        print 'Returning: {0}'.format(assignment)

        return assignment

    def split_max_var(self, assignment):
        clusters = self.get_clusters(assignment)
        var_dict = {}

        for k in clusters.keys():
            var_dict[k] = self.var(clusters[k])

        print var_dict

        cluster_to_split, var = max(clusters.iteritems(),
                                    key=operator.itemgetter(1))

    def split_search(self, assignment):
        clusters = self.get_clusters(assignment)
        k = max(assignment.partition_vector)
        best_score = -np.Inf

        for i in clusters:
            print 'i: {0}'.format(i)
            test_assignment = self.split(i, assignment)
            score = self.Scorer.score(test_assignment)
            if max(test_assignment.partition_vector) == k + 1:
                score = self.Scorer.score(test_assignment)
            else:
                score = -np.Inf
                print 'Something has gone wrong'
            print test_assignment
            print score

            if score > best_score:
                best_score = score
                best_assignment = test_assignment
                # print 'New High Watermark'

        return best_assignment

    def test(self, record, tree, model='WAG'):
        """
        Test an alignment against a tree topology.

        Writes `record` (interleaved phylip) and `tree` (newick) to fixed
        file names inside self.tmpdir, runs Phyml on them and returns the
        `score` of the run.

        model: value passed to Phyml's -m flag
        NOTE(review): the -d flag is fixed to 'aa' regardless of
        self.datatype - confirm this is intended.
        """
        alignment_path = '{0}/tmp_alignment.phy'.format(self.tmpdir)
        tree_path = '{0}/tmp_tree.nwk'.format(self.tmpdir)

        alignment_file = record.write_phylip(alignment_path, interleaved=True)
        newick_file = tree.write_to_file(tree_path)

        runner = Phyml(record)
        for tmp in (alignment_file, newick_file):
            runner.add_tempfile(tmp)

        flags = (
            ('-i', alignment_file),
            ('-u', newick_file),
            ('-b', '0'),    # no bootstraps
            ('-m', model),  # evolutionary model
            ('-o', 'n'),    # no optimisation
            ('-d', 'aa'),   # datatype
        )
        for flag, value in flags:
            runner.add_flag(flag, value)

        return runner.run().score

    def var(self, members):
        score = self.Scorer.add(tuple(members)).score
        records = [self.Collection.records[i] for i in members]
        total_length = sum([r.seqlength for r in records])

        return (score / total_length)

    def optimise(self,
                 assignment,
                 update=True,
                 history=True,
                 sample_size=10,
                 nreassign=10,
                 max_stayed_put=5,
                 max_resets=5,
                 max_done_worse=5,
                 max_iter=1000):
        """
        Search for a better-scoring partition by repeated self.move() calls.

        assignment: partition to start from
        update: when True, self.update() is called with the best partition
            found before returning
        history: forwarded to self.Scorer.score
        sample_size, nreassign: forwarded to self.move
        max_stayed_put: stop once more than this many consecutive moves
            scored within EPS of the best score
        max_resets: stop once this many resets have happened
        max_done_worse: after this many consecutive worse moves, count a
            reset and restart from the best partition found so far
        max_iter: stop when the iteration counter self.i reaches this value

        The counters (self.i, self.stayed_put, self.done_worse, self.resets)
        are instance attributes; they are zeroed via self._reset_counts()
        at the end of the call, not at the start.

        Returns the best partition found during this call.
        """

        local_best_assignment = assignment
        local_best_score = self.Scorer.score(local_best_assignment,
                                             history=history)
        current_assignment = local_best_assignment
        # Running list of record indices drawn by self.move().
        self.sampled = []

        print 'Optimising: {0} {1} {2}'.format(self.i, local_best_score,
                                               current_assignment)

        while True:
            # --- stopping / reset checks, evaluated in this order ---
            if self.stayed_put > max_stayed_put:
                print 'stayed put too many times ({0})'.format(max_stayed_put)
                break
            if self.resets == max_resets:
                print 'Reset limit reached ({0})'.format(max_resets)
                break
            if self.done_worse == max_done_worse:
                print 'wandered off, resetting...'
                self.resets += 1
                self.done_worse = 0
                current_assignment = local_best_assignment
            if self.i == max_iter:
                print 'max iterations reached'
                break

            # NOTE(review): current_assignment is only reassigned in the
            # reset branch above, never to new_assignment, so every move
            # between resets starts from the same partition - confirm
            # whether the search was meant to walk from move to move.
            new_assignment = self.move(sample_size, current_assignment,
                                       nreassign)
            score = self.Scorer.score(new_assignment, history=history)
            print score, new_assignment

            if score > local_best_score:
                # Improvement: record it and clear all counters.
                self.sampled = []
                local_best_score = score
                local_best_assignment = new_assignment
                self.stayed_put = 0
                self.done_worse = 0
                self.resets = 0
            elif np.abs(score - local_best_score) < EPS:
                # Score unchanged (within EPS): keep self.sampled as is.
                self.stayed_put += 1
                self.done_worse = 0
            else:
                # Worse score.
                self.sampled = []
                self.stayed_put = 0
                self.done_worse += 1
            # _status() reports the *global* best, which this loop does not
            # modify; it only changes through self.update() below.
            print self._status()
            self.i += 1

        if update is True:
            self.update(local_best_assignment)

        print self._status()
        self._reset_counts()
        return local_best_assignment

    def optimise_with_merge(self, assignment, update=True, **kwargs):
        new_assignment = self.optimise(assignment, **kwargs)
        print 'Partition after {0} merges at {1}:\n{2}'.format(
            self.merges, sum(os.times()[:4]), new_assignment)
        opt_score = self.Scorer.score(new_assignment)
        print 'Score: {0}'.format(opt_score)
        self.merges += 1

        split = self.split_search(new_assignment)
        split = self.optimise(split, history=False, max_iter=10, **kwargs)
        print 'Partition after {0} splits at {1}:\n{2}'.format(
            self.merges, sum(os.times()[:4]), new_assignment)
        print 'Score: {0}'.format(self.Scorer.score(split))

        merged = self.merge_closest(split)
        merged_score = self.Scorer.score(merged)
        print 'Partition after {0} merges at {1}:\n{2}'.format(
            self.merges, sum(os.times()[:4]), new_assignment)
        print 'Score: {0}'.format(merged_score)

        if np.abs(merged_score - opt_score) > EPS:
            merged = self.optimise_with_merge(merged, **kwargs)
        else:
            if update is True:
                self.update(merged)
            return (merged)

    def final_assignment(self, assignment):
        """
        Run one self.move() over `assignment` with both the sample size and
        the number of reassignments set to len(assignment), and keep the
        result as the global best when it scores higher.
        """
        n = len(assignment)
        new_assignment = self.move(n, assignment, n)
        score = self.Scorer.score(new_assignment)
        # Only a strict improvement replaces the stored global best.
        if score > self.global_best_score:
            self.global_best_score = score
            self.global_best_assignment = new_assignment