Python Scorer Examples

Programming Language: Python

Namespace/Package Name: collection

Class/Type: Scorer

Examples at hotexamples.com: 11

Python Scorer - 11 examples found. These are the top rated real world Python examples of collection.Scorer extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

Scorer(4)

add(2)

score(2)

history(1)

Example #1

Show file

File: algorithms.py Project: mgperry/treeCl

    def __init__(
        self,
        collection,
        nclusters,
        metric='euc',
        tmpdir=None,
    ):

        if not isinstance(nclusters, int) or nclusters <= 1:
            raise Exception('Need appropriate value for number of clusters.')

        self.nclusters = nclusters
        self.scorer = Scorer(collection.records,
                             collection.analysis)  # Could check for entries
        self.datatype = collection.datatype
        self.metric = metric

        try:
            self.tmpdir
        except:
            self.tmpdir = collection.tmpdir

Example #2

Show file

File: optimiser.py Project: xypan1232/treeCl

    def __init__(self, nclusters, collection, tmpdir=TMPDIR,
                 analysis='nj', initial_assignment=None, scorer=None):
        optioncheck(analysis, ANALYSES + ['tc', 'TreeCollection'])
        if self.analysis == 'tc':
            self.analysis = 'TreeCollection'
        else:
            self.analysis = analysis

        self.Collection = collection

        if not self.Collection.records[0].tree:
            print('Calculating {} trees for collection...'.format(analysis))
            self.Collection.calc_NJ_trees()

        self.datatype = collection.datatype
        if scorer is not None and isinstance(scorer, Scorer):
            self.scorer = scorer
        else:
            self.scorer = Scorer(self.Collection)

        self.nclusters = nclusters
        self.tmpdir = tmpdir

        print('Calculating initial scores...')
        if initial_assignment is None:
            initial_assignment = Partition(tuple([0] * len(collection)))
            # initial_assignment = self.random_partition(nclusters)

        self.global_best_scores = {}
        self.global_best_assignments = {}
        self.global_best_scores[self.nclusters] = self.scorer.score(
            initial_assignment, history=True)
        self.global_best_assignments[self.nclusters] = initial_assignment

        self.done_worse = 0
        self.stayed_put = 0
        self.i = 0
        self.resets = 0
        self.merges = 0

Example #3

Show file

File: optimiser.py Project: verajohne/treeCl

    def __init__(self, nclusters, collection, tmpdir=TMPDIR,
                 analysis='nj', initial_assignment=None, scorer=None):
        optioncheck(analysis, ANALYSES + ['tc', 'TreeCollection'])
        if self.analysis == 'tc':
            self.analysis = 'TreeCollection'
        else:
            self.analysis = analysis

        self.Collection = collection

        if not self.Collection.records[0].tree:
            print('Calculating {} trees for collection...'.format(analysis))
            self.Collection.calc_NJ_trees()

        self.datatype = collection.datatype
        if scorer is not None and isinstance(scorer, Scorer):
            self.scorer = scorer
        else:
            self.scorer = Scorer(self.Collection.records,
                                 analysis=analysis,
                                 datatype=self.datatype,
                                 tmpdir=tmpdir)

        self.nclusters = nclusters
        self.tmpdir = tmpdir

        print('Calculating initial scores...')
        if initial_assignment is None:
            initial_assignment = Partition(tuple([0]*len(collection)))
            # initial_assignment = self.random_partition(nclusters)

        self.global_best_scores = {}
        self.global_best_assignments = {}
        self.global_best_scores[self.nclusters] = self.scorer.score(
            initial_assignment, history=True)
        self.global_best_assignments[self.nclusters] = initial_assignment

        self.done_worse = 0
        self.stayed_put = 0
        self.i = 0
        self.resets = 0
        self.merges = 0

Example #4

Show file

File: algorithms.py Project: mgperry/treeCl

    def __init__(
        self,
        collection,
        nclusters,
        metric='euc',
        tmpdir=None,
    ):

        if not isinstance(nclusters, int) or nclusters <= 1:
            raise Exception('Need appropriate value for number of clusters.')

        self.nclusters = nclusters
        self.scorer = Scorer(collection.records, collection.analysis)  # Could check for entries
        self.datatype = collection.datatype
        self.metric = metric

        try:
            self.tmpdir
        except:
            self.tmpdir = collection.tmpdir

Example #5

Show file

File: optimiser.py Project: verajohne/treeCl

class Optimiser(object):

    def __init__(self, nclusters, collection, tmpdir=TMPDIR,
                 analysis='nj', initial_assignment=None, scorer=None):
        optioncheck(analysis, ANALYSES + ['tc', 'TreeCollection'])
        if self.analysis == 'tc':
            self.analysis = 'TreeCollection'
        else:
            self.analysis = analysis

        self.Collection = collection

        if not self.Collection.records[0].tree:
            print('Calculating {} trees for collection...'.format(analysis))
            self.Collection.calc_NJ_trees()

        self.datatype = collection.datatype
        if scorer is not None and isinstance(scorer, Scorer):
            self.scorer = scorer
        else:
            self.scorer = Scorer(self.Collection.records,
                                 analysis=analysis,
                                 datatype=self.datatype,
                                 tmpdir=tmpdir)

        self.nclusters = nclusters
        self.tmpdir = tmpdir

        print('Calculating initial scores...')
        if initial_assignment is None:
            initial_assignment = Partition(tuple([0]*len(collection)))
            # initial_assignment = self.random_partition(nclusters)

        self.global_best_scores = {}
        self.global_best_assignments = {}
        self.global_best_scores[self.nclusters] = self.scorer.score(
            initial_assignment, history=True)
        self.global_best_assignments[self.nclusters] = initial_assignment

        self.done_worse = 0
        self.stayed_put = 0
        self.i = 0
        self.resets = 0
        self.merges = 0

    def _reset_counts(self):
        self.done_worse = 0
        self.stayed_put = 0
        self.i = 0
        self.resets = 0

    def status(self, current_assignment, details=None):
        iter_ = self.i
        n = len(current_assignment)
        curr_score = self.scorer.score(current_assignment, history=False)
        best_score = self.global_best_scores[n]
        details = ('\t'+str(details) if details is not None else '')

        return 'Iter:{0}\tNclusters:{1}\tCurrent\tscore:{2}\tBest score:{3}{4}'.format(
            iter_, n, curr_score, best_score, details)

    def random_partition(self, nclusters):
        return Partition(tuple(np.random.randint(nclusters,
                         size=len(self.Collection))))

    def update(self, assignment):
        """
        method for working interactively and keeping nclusters correct
        """
        nclusters = len(assignment) # len(assignment) == number of clusters
        best_score = self.global_best_scores.get(nclusters, NEGINF)
        curr_score = self.scorer.score(assignment, history=False)
        if (curr_score - best_score) > EPS:
            self.global_best_assignments[nclusters] = assignment
            self.global_best_scores[nclusters] = self.scorer.score(assignment,
                history=False)

    def get_clusters(self, assignment):
        pvec = assignment.partition_vector
        index_dict = defaultdict(list)
        for (position, value) in enumerate(pvec):
            index_dict[value].append(position)
        return index_dict

    def get_cluster_trees(self, assignment, index_dict=None):
        index_dict = (index_dict or self.get_clusters(assignment))
        tree_dict = {}
        for (k, v) in index_dict.items():
            if not tuple(v) in self.scorer.concats:
                self.scorer.add(tuple(v))
            tree_dict[k] = self.scorer.concats[tuple(v)]
        return tree_dict

    def score_sample(self, sample, assignment):
        """
        !! changed to simply SCORE a PRE-MADE SAMPLE
        sample_size:int, assignment:Partition object
        Calculates score m*n score matrix, where m is number of alignments
        in the sample, and n in the number of clusters encoded in the
        assignment (==Partition object)
        """
        # sample = random.sample(range(len(self.Collection)), sample_size)
        cluster_trees = self.get_cluster_trees(assignment)
        scores = np.zeros((len(sample), len(cluster_trees)))
        for i, record_index in enumerate(sample):
            rec = self.Collection.records[record_index]
            for j, tree in cluster_trees.items():
                scores[i, j-1] = self.test(rec, tree)
        return (scores)

    def constrain_assignment(self, assignment, nclusters=None):
        """
        Constrain the assignment to have self.nclusters clusters
        """

        if nclusters is None:
            nclusters = self.nclusters
        if (nclusters < 1) or (nclusters > len(self.Collection)):
            raise ValueError('Invalid number of clusters: {}'.format(nclusters))
        while len(assignment.get_membership()) > nclusters:
            assignment = self.merge_closest(assignment)
        while len(assignment.get_membership()) < nclusters:
            assignment = self.split_search(assignment)
        return assignment

    def make_new_assignment(self, sample, scores, assignment, nreassign=1,
            choose='max'):
        """
        MAKES A NEW PARTITION BY REASSIGNING RECORDS BETWEEN CLUSTERS
        """

        new_clusters = scores.argmax(axis=1)
        M = scores/scores.sum(axis=1)[:, np.newaxis]
        if choose == 'max':
            reassignments = M.max(axis=1).argsort()[-nreassign:]
        elif choose == 'min':
            reassignments = M.min(axis=1).argsort()[:nreassign]

        new_assignment = list(assignment.partition_vector)

        for i in reassignments:
            new_assignment[sample[i]] = new_clusters[i]+1
                # because cluster number is in range
                # [1,x], and new_clusters is in range [0,x-1]

        return Partition(tuple(new_assignment))

    def move(self, sample_size, assignment, nreassign=1, choose='max',
            sampled=None):
        """
        !! now generates own sample and passes to scores
        wraps self.score_sample + self.new_assignment
        """

        if sampled is None:
            sampled = list()

        unsampled = set(range(len(self.Collection))) - set(sampled)

        if len(unsampled) > 0:
            if sample_size > len(unsampled):
                sample = unsampled
            else:
                sample = random.sample(unsampled, sample_size)

            self.sampled.extend(sample)
            scores = self.score_sample(sample, assignment)
            assignment = self.make_new_assignment(sample, scores, assignment,
                                                  nreassign, choose)
        return assignment

    def merge(self, assignment, label1, label2):
        pvec = ((x if x != label1 else label2)
                for x in assignment.partition_vector)
        return Partition(tuple(pvec))

    def merge_closest(self, assignment):
        print('Finding clusters to merge...')
        clusters = self.get_clusters(assignment)
        best_score = NEGINF
        merging = [None, None]

        for i in clusters:
            for j in clusters:
                # print('i = {}, j = {}'.format(i, j))
                if i >= j:
                    continue
                print('Testing Clusters {0} and {1}'.format(i, j))
                test_assignment = self.merge(assignment, i, j)
                self.update(test_assignment)
                score = self.scorer.score(test_assignment, history=False)

                if score > best_score:
                    merging[0] = i
                    merging[1] = j
                    best_score = score
                    best_assignment = test_assignment

        print('Merging clusters {0} and {1}'.format(*merging))
        print('Best assignment: {0}'.format(best_assignment))
        return(best_assignment)

    def split(self, k, assignment, verbosity=1):
        """
        Function to split cluster based on least representative alignment
        """
        if verbosity > 1:
            print(assignment)
        members = self.get_clusters(assignment)[k]
        if len(members) == 1:
            return assignment
        elif len(members) == 2:
            new_partition_vector = list(assignment.partition_vector)
            new_partition_vector[members[0]] = max(assignment.partition_vector) + 1
            new_assignment = Partition(new_partition_vector)
            return new_assignment

        tree = self.get_cluster_trees(assignment)[k]
        alignment_scores = {}
        if verbosity > 0:
            print('Calculating alignment scores...')

        for i in members:
            r = self.Collection.records[i]
            alignment_scores[i] = self.test(r, tree) / float(r.seqlength)
                # per-site likelihood

        seed, min_score = min(alignment_scores.iteritems(),
            key=operator.itemgetter(1))
        print('Splitting on {0}.'.format(seed+1))# convert to 1-based indexing

        new_assignment = list(assignment.partition_vector)
        new_assignment[seed] = max(assignment.partition_vector) + 1
        if verbosity > 1:
            print('New Partition: {0}'.format(new_assignment))
        if verbosity > 0:
            print('Assigning to new partition...')

        new_assignment = Partition(new_assignment)
        scores = self.score_sample(members, new_assignment)
        assignment = self.make_new_assignment(members, scores, new_assignment,
            nreassign=len(members))
        if verbosity > 1:
            print('Returning: {0}'.format(assignment))

        return assignment

    def split_max_var(self, assignment):
        clusters = self.get_clusters(assignment)
        var_dict = {}

        for k in clusters.keys():
            var_dict[k] = self.var(clusters[k])

        print(var_dict)

        cluster_to_split, var = max(clusters.iteritems(),
            key=operator.itemgetter(1))

    def split_search(self, assignment, update=True):
        clusters = self.get_clusters(assignment)
        k = len(assignment)
        best_score = NEGINF

        for i in clusters:
            print('i: {0}'.format(i))
            test_assignment = self.split(i, assignment)
            # score = self.scorer.score(test_assignment)
            if len(test_assignment) == k + 1:
                score = self.scorer.score(test_assignment, history=False)
                self.update(test_assignment)
            else:
                score = -np.Inf
                print('Something has gone wrong')
            print(test_assignment)
            print(score)

            if score > best_score:
                best_score = score
                best_assignment = test_assignment

        return best_assignment

    def test(self, record, tree, model=None):
        """
        TESTS AN ALIGNMENT AGAINST A TREE TOPOLOGY
        """
        tmp_record = copy.deepcopy(record)

        # if tree label set and record label set don't match
        header_set = set(tmp_record.headers)
        extra_in_tree = tree.labels - header_set
        extra_in_record = header_set - tree.labels

        if extra_in_tree:
            for lab in extra_in_tree:
                tmp_record.headers.append(lab)
                tmp_record.sequences.append(''.join(['-']*tmp_record.seqlength))
            tmp_record._update()

        if extra_in_record:
            for lab in extra_in_record:
                i = tmp_record.headers.index(lab)
                tmp_record.headers   = (tmp_record.headers[:i] +
                                        tmp_record.headers[i+1:])
                tmp_record.sequences = (tmp_record.sequences[:i] +
                                        tmp_record.sequences[i+1:])
            tmp_record._update()

        return tmp_alignment.likelihood(tree, self.tmpdir, fit_rates=True)
        # alignment_file = tmp_record.write_phylip('{0}/tmp_alignment.phy'.format(
        #     self.tmpdir), interleaved=True)
        # newick_file = tree.write_to_file('{0}/tmp_tree.nwk'.format(self.tmpdir))
        # p = Phyml(tmp_record, self.tmpdir)
        # p.add_tempfile(alignment_file)
        # p.add_tempfile(newick_file)
        # p.add_flag('-i', alignment_file)
        # p.add_flag('-u', newick_file)
        # p.add_flag('-b', '0')    # no bootstraps
        # if tmp_record.datatype == 'dna':
        #     if model is None:
        #         model = 'GTR'
        #     p.add_flag('-m', model)
        #     p.add_flag('-d', 'nt')
        # else:
        #     if model is None:
        #         model = 'WAG'
        #     p.add_flag('-m', model)  # evolutionary model
        #     p.add_flag('-d', 'aa')   # datatype

        # p.add_flag('-o', 'n')    # no optimisation
        # return p.run().score

    def var(self, members):
        score = self.scorer.add(tuple(members)).score
        records = [self.Collection.records[i] for i in members]
        total_length = sum([r.seqlength for r in records])

        return(score / total_length)

    def optimise(self,
                 assignment,
                 nclusters=None,
                 update=True,
                 history=True,
                 sample_size=10,
                 nreassign=10,
                 max_stayed_put=25,
                 max_resets=5,
                 max_done_worse=5,
                 max_iter=1000):

        if nclusters is None:
            nclusters = self.nclusters

        assignment = self.constrain_assignment(assignment, nclusters)

        local_best_assignment = assignment
        local_best_score = self.scorer.score(local_best_assignment,
            history=False)
        current_assignment = local_best_assignment
        self.sampled = []

        print(self.status(current_assignment))

        while True:
            if self.stayed_put > max_stayed_put:
                print('stayed put too many times ({0})'.format(max_stayed_put))
                break
            if self.resets == max_resets:
                print('Reset limit reached ({0})'.format(max_resets))
                break
            if self.done_worse == max_done_worse:
                print('wandered off, resetting...')
                self.resets += 1
                self.done_worse = 0
                current_assignment = local_best_assignment
            if self.i == max_iter:
                print('max iterations reached')
                break

            new_assignment = self.move(sample_size, current_assignment,
                                       nreassign)
            new_assignment = self.constrain_assignment(new_assignment,
                nclusters)
            score = self.scorer.score(new_assignment, history=history)
            self.update(new_assignment)

            if (score - local_best_score) > EPS:
                self.sampled = []
                local_best_score = score
                local_best_assignment = new_assignment
                self.stayed_put = 0
                self.done_worse = 0
                self.resets = 0
                print(self.status(new_assignment, '(Improved)'))
            elif np.abs(score - local_best_score) < EPS:
                self.stayed_put += 1
                self.done_worse = 0
                message = ('(No improvement - [{}/{}])'.format(self.stayed_put,
                                                               max_stayed_put))
                print(self.status(new_assignment, message))
            else:
                self.sampled = []
                #self.stayed_put = 0
                self.done_worse += 1
                message = '(Did worse - [{}/{}]'.format(self.done_worse,
                                                        max_done_worse)
                print(self.status(new_assignment, message))

            self.i += 1

        self._reset_counts()
        return local_best_assignment

    def optimise_with_variable_clusters(self,
            assignment,
            target_clusters,
            max_clusters,
            optimise_on_ascent=True,
            optimise_on_descent=True,
            update=True,
            **kwargs):

        if max_clusters < target_clusters:
            raise ValueError('max_clusters ({}) must be at least equal to '
                'target_clusters ({})'.format(max_clusters, target_clusters))

        current_clusters = len(assignment)
        print('Optimising current assignment with {} clusters. Optimiser will '
              'ascend to {} clusters, and descend to a target of {} clusters'
              '.'.format(current_clusters, max_clusters, target_clusters))
        for n in range(current_clusters, max_clusters+1):
            print("ASCENDING (optimisation:{}) -> Current target: "
                  "{} clusters".format(('ON' if optimise_on_ascent else 'OFF'),
                                       n))
            if optimise_on_ascent:
                assignment = self.optimise(assignment, nclusters=n, **kwargs)
            else:
                assignment = self.constrain_assignment(assignment, n)

        for n in range(max_clusters-1, target_clusters-1, -1):
            print('DESCENDING (optimisation:{}) -> Current target: {} '
                  'clusters'.format(('ON' if optimise_on_descent else 'OFF'),
                                    n))
            if optimise_on_descent:
                assignment = self.optimise(assignment, nclusters=n, **kwargs)
            else:
                assignment = self.constrain_assignment(assignment, n)

        return self.constrain_assignment(assignment, target_clusters)

    def write(self, filename):
        headers = ['Iteration', 'CPU Time', 'Likelihood', 'Partition',
                   'NClusters']
        output = [[i] + x + len(x[-1])
                    for (i, x) in enumerate(self.scorer.history)]

        with open(filename, 'w+') as file_:
            writer = csv.writer(file_, delimiter='\t', quoting=csv.QUOTE_NONE)
            writer.writerow(headers)
            writer.writerows(output)

Example #6

Show file

File: algorithms.py Project: mgperry/treeCl

 def __init__(self, members, records, analysis):
     self.members = tuple(members)
     self.records = [records[i] for i in self.members]
     self.scorer = Scorer(records, analysis)
     self.tree = self.scorer.add(self.members)

Example #7

Show file

File: algorithms.py Project: mgperry/treeCl

class EMTrees(object):
    def __init__(
        self,
        collection,
        nclusters,
        metric='euc',
        tmpdir=None,
    ):

        if not isinstance(nclusters, int) or nclusters <= 1:
            raise Exception('Need appropriate value for number of clusters.')

        self.nclusters = nclusters
        self.scorer = Scorer(collection.records,
                             collection.analysis)  # Could check for entries
        self.datatype = collection.datatype
        self.metric = metric

        try:
            self.tmpdir
        except:
            self.tmpdir = collection.tmpdir

    def clusters_init(self):
        k = self.nclusters
        assignment = [0] * len(self.scorer.records)
        for i in range(k):
            assignment[np.random.randint(0, len(assignment))] = i + 1
        partition = Partition(assignment)
        clusters = [0] * k
        members = partition.get_membership()[1:]
        self.assign_clusters(clusters, members)
        for (index, record) in enumerate(self.scorer.records):
            scores = [
                self.ml(record, clusters[n]) for n in range(self.nclusters)
            ]
            # print scores
            if assignment.count(
                    assignment[index]) > 1 or assignment[index] == 0:
                assignment[index] = scores.index(max(scores)) + 1
        self.partition = Partition(assignment)
        self.L = self.scorer.score(self.partition)

    def random_partition(self):
        self.partition = Partition(
            tuple(
                np.random.randint(self.nclusters,
                                  size=len(self.scorer.records))))
        self.L = self.scorer.score(self.partition)

    def assign_clusters(self, clusters, members):
        for n in range(self.nclusters):
            if not clusters[n] or clusters[n].members != members[n]:
                clusters[n] = Cluster(members[n], self.scorer.records,
                                      self.scorer.analysis)

        return (clusters)

    def maximise(self, method):
        clusters = [0] * self.nclusters
        alg = getattr(self, method)
        count = 0

        while True:
            self.assign_clusters(clusters, self.partition.get_membership())
            assignment = list(self.partition.partition_vector)

            for (index, record) in enumerate(self.scorer.records):
                scores = [
                    alg(record, clusters[n]) for n in range(self.nclusters)
                ]
                # print scores
                if assignment.count(
                        assignment[index]) > 1 or assignment[index] == 0:
                    assignment[index] = scores.index(max(scores)) + 1

            assignment = Partition(assignment)
            score = self.scorer.score(assignment)

            if score > self.L:
                self.L = score
                self.partition = assignment

            else:
                count += 1
                if count > 1:
                    break  # Algorithm is deterministic so no need for more iterations

    def maximise_random(self, method):
        clusters = [0] * self.nclusters
        alg = getattr(self, method)
        count = 0
        sampled = []

        while True:
            self.assign_clusters(clusters, self.partition.get_membership())
            assignment = list(self.partition.partition_vector)

            index = randint(0, len(self.scorer.records) - 1)

            if index in sampled:
                continue
            else:
                record = self.scorer.records[index]
                sampled.append(index)

            scores = [alg(record, clusters[n]) for n in range(self.nclusters)]

            if assignment.count(
                    assignment[index]) > 1 or assignment[index] == 0:
                assignment[index] = scores.index(max(scores)) + 1

            assignment = Partition(assignment)
            score = self.scorer.score(assignment)

            if score > self.L:
                self.L = score
                self.partition = assignment
                sampled = []
                count = 0
            else:
                count += 1
                if count == len(assignment): break

    def maximise_heuristic(self):
        clusters = [0] * self.nclusters
        sampled = []

        for i in range(1000):
            self.assign_clusters(clusters, self.partition.get_membership())
            assignment = list(self.partition.partition_vector)

            index = randint(0, len(self.scorer.records) - 1)

            record = self.scorer.records[index]
            sampled.append(index)

            lls = [self.ml(record, clusters[n]) for n in range(self.nclusters)]

            a = {'ll': max(lls)}
            a['n'] = lls.index(a['ll'])
            lls.pop(a['n'])

            b = {'ll': max(lls)}
            b['n'] = lls.index(b['ll'])

            a['p'] = np.maths.exp(a['ll'] - logsum(a['ll'], b['ll']))

            if np.random.uniform() > a['p']:
                choice = a['n']
            else:
                choice = b['n']

            if assignment.count(
                    assignment[index]) > 1 or assignment[index] == 0:
                assignment[index] = choice + 1

            assignment = Partition(assignment)

            if i % 10 == 0:
                score = self.scorer.score(assignment)

                if score > self.L:
                    self.max_L = score
                    self.max_partition = assignment

    def dist(self, obj1, obj2):
        distance = DistanceMatrix([obj1.tree, obj2.tree], self.metric)[0][1]
        return (-distance)

    def ml(self, record, cluster, verbose=1):
        p = Phyml(record, tmpdir=self.tmpdir)
        input_tree = os.path.join(self.tmpdir, 'input_tree')
        cluster.tree.write_to_file(input_tree)
        p.add_tempfile(input_tree)
        p.add_flag('--inputtree', input_tree)
        p.add_flag('-o', 'r')  # Optimise only on substitutions`
        p.add_flag('-a', 'e')
        p.add_flag('-b', 0)
        p.add_flag('-c', 4)
        p.add_flag('--quiet', '')

        if self.datatype == 'protein':
            p.add_flag('-d', 'aa')
        elif self.datatype == 'dna':
            p.add_flag('-d', 'nt')

        score = p.run(verbosity=verbose).score
        return (score)

Example #8

Show file

#!/usr/bin/env python

from collection import Collection, Scorer
from clustering import Partition
from random import randint
from anneal import *
import pickle

c = Collection(input_dir='/homes/mgperry/treeCl_data/easy_case/', compression='gz', file_format='phylip', datatype='protein')

scorer = Scorer(c.records, 'nj')

k = 4

partition = [randint(1, k) for rec in scorer.records]


def likelihood(partition, scorer):
    score = scorer.score(Partition(partition))
    return(score)

print type(partition)

opts = {'func': likelihood,
        'x0': partition,
        'args': [scorer],
        'schedule': 'cluster',
        'full_output': 1,
        'T0': 100000,
        'Tf': 1,
        'maxeval': None,

Example #9

Show file

File: optimiser.py Project: xypan1232/treeCl

class Optimiser(object):
    def __init__(self, nclusters, collection, tmpdir=TMPDIR,
                 analysis='nj', initial_assignment=None, scorer=None):
        optioncheck(analysis, ANALYSES + ['tc', 'TreeCollection'])
        if self.analysis == 'tc':
            self.analysis = 'TreeCollection'
        else:
            self.analysis = analysis

        self.Collection = collection

        if not self.Collection.records[0].tree:
            print('Calculating {} trees for collection...'.format(analysis))
            self.Collection.calc_NJ_trees()

        self.datatype = collection.datatype
        if scorer is not None and isinstance(scorer, Scorer):
            self.scorer = scorer
        else:
            self.scorer = Scorer(self.Collection)

        self.nclusters = nclusters
        self.tmpdir = tmpdir

        print('Calculating initial scores...')
        if initial_assignment is None:
            initial_assignment = Partition(tuple([0] * len(collection)))
            # initial_assignment = self.random_partition(nclusters)

        self.global_best_scores = {}
        self.global_best_assignments = {}
        self.global_best_scores[self.nclusters] = self.scorer.score(
            initial_assignment, history=True)
        self.global_best_assignments[self.nclusters] = initial_assignment

        self.done_worse = 0
        self.stayed_put = 0
        self.i = 0
        self.resets = 0
        self.merges = 0

    def _reset_counts(self):
        self.done_worse = 0
        self.stayed_put = 0
        self.i = 0
        self.resets = 0

    def status(self, current_assignment, details=None):
        iter_ = self.i
        n = len(current_assignment)
        curr_score = self.scorer.score(current_assignment, history=False)
        best_score = self.global_best_scores[n]
        details = ('\t' + str(details) if details is not None else '')

        return 'Iter:{0}\tNclusters:{1}\tCurrent\tscore:{2}\tBest score:{3}{4}'.format(
            iter_, n, curr_score, best_score, details)

    def random_partition(self, nclusters):
        return Partition(tuple(np.random.randint(nclusters,
                                                 size=len(self.Collection))))

    def update(self, assignment):
        """
        method for working interactively and keeping nclusters correct
        """
        nclusters = len(assignment)  # len(assignment) == number of clusters
        best_score = self.global_best_scores.get(nclusters, NEGINF)
        curr_score = self.scorer.score(assignment, history=False)
        if (curr_score - best_score) > EPS:
            self.global_best_assignments[nclusters] = assignment
            self.global_best_scores[nclusters] = self.scorer.score(assignment,
                                                                   history=False)

    def get_cluster_trees(self, assignment, index_dict=None):
        index_dict = (index_dict or get_clusters(assignment))
        tree_dict = {}
        for (k, v) in index_dict.items():
            if not tuple(v) in self.scorer.concats:
                self.scorer.add(tuple(v))
            tree_dict[k] = self.scorer.concats[tuple(v)]
        return tree_dict

    def score_sample(self, sample, assignment):
        """
        !! changed to simply SCORE a PRE-MADE SAMPLE
        sample_size:int, assignment:Partition object
        Calculates score m*n score matrix, where m is number of alignments
        in the sample, and n in the number of clusters encoded in the
        assignment (==Partition object)
        """
        # sample = random.sample(range(len(self.Collection)), sample_size)
        cluster_trees = self.get_cluster_trees(assignment)
        scores = np.zeros((len(sample), len(cluster_trees)))
        for i, record_index in enumerate(sample):
            rec = self.Collection.records[record_index]
            for j, tree in cluster_trees.items():
                scores[i, j - 1] = self.test(rec, tree)
        return scores

    def constrain_assignment(self, assignment, nclusters=None):
        """
        Constrain the assignment to have self.nclusters clusters
        """

        if nclusters is None:
            nclusters = self.nclusters
        if (nclusters < 1) or (nclusters > len(self.Collection)):
            raise ValueError('Invalid number of clusters: {}'.format(nclusters))
        while len(assignment.get_membership()) > nclusters:
            assignment = self.merge_closest(assignment)
        while len(assignment.get_membership()) < nclusters:
            assignment = self.split_search(assignment)
        return assignment

    def make_new_assignment(self, sample, scores, assignment, nreassign=1,
                            choose='max'):
        """
        MAKES A NEW PARTITION BY REASSIGNING RECORDS BETWEEN CLUSTERS
        """
        optioncheck(choose, ('max', 'min'))
        new_clusters = scores.argmax(axis=1)
        M = scores / scores.sum(axis=1)[:, np.newaxis]
        if choose == 'max':
            reassignments = M.max(axis=1).argsort()[-nreassign:]
        else:
            reassignments = M.min(axis=1).argsort()[:nreassign]

        new_assignment = list(assignment.partition_vector)

        for i in reassignments:
            new_assignment[sample[i]] = new_clusters[i] + 1
            # because cluster number is in range
            # [1,x], and new_clusters is in range [0,x-1]

        return Partition(tuple(new_assignment))

    def move(self, sample_size, assignment, nreassign=1, choose='max',
             sampled=None):
        """
        !! now generates own sample and passes to scores
        wraps self.score_sample + self.new_assignment
        """

        if sampled is None:
            sampled = list()

        unsampled = set(range(len(self.Collection))) - set(sampled)

        if len(unsampled) > 0:
            if sample_size > len(unsampled):
                sample = unsampled
            else:
                sample = random.sample(unsampled, sample_size)

            self.sampled.extend(sample)
            scores = self.score_sample(sample, assignment)
            assignment = self.make_new_assignment(sample, scores, assignment,
                                                  nreassign, choose)
        return assignment

    def merge(self, assignment, label1, label2):
        pvec = ((x if x != label1 else label2)
                for x in assignment.partition_vector)
        return Partition(tuple(pvec))

    def merge_closest(self, assignment):
        print('Finding clusters to merge...')
        clusters = get_clusters(assignment)
        best_score = NEGINF
        merging = [None, None]

        for i in clusters:
            for j in clusters:
                # print('i = {}, j = {}'.format(i, j))
                if i >= j:
                    continue
                print('Testing Clusters {0} and {1}'.format(i, j))
                test_assignment = self.merge(assignment, i, j)
                self.update(test_assignment)
                score_value = self.scorer.score(test_assignment, history=False)

                if score_value > best_score:
                    merging[0] = i
                    merging[1] = j
                    best_score = score_value
                    best_assignment = test_assignment

        print('Merging clusters {0} and {1}'.format(*merging))
        print('Best assignment: {0}'.format(best_assignment))
        return best_assignment

    def split(self, k, assignment, verbosity=1):
        """
        Function to split cluster based on least representative alignment
        """
        if verbosity > 1:
            print(assignment)
        members = get_clusters(assignment)[k]
        if len(members) == 1:
            return assignment
        elif len(members) == 2:
            new_partition_vector = list(assignment.partition_vector)
            new_partition_vector[members[0]] = max(assignment.partition_vector) + 1
            new_assignment = Partition(new_partition_vector)
            return new_assignment

        tree = self.get_cluster_trees(assignment)[k]
        alignment_scores = {}
        if verbosity > 0:
            print('Calculating alignment scores...')

        for i in members:
            r = self.Collection.records[i]
            alignment_scores[i] = self.test(r, tree) / float(r.seqlength)
            # per-site likelihood

        seed, min_score = min(alignment_scores.iteritems(),
                              key=operator.itemgetter(1))
        print('Splitting on {0}.'.format(seed + 1))  # convert to 1-based indexing

        new_assignment = list(assignment.partition_vector)
        new_assignment[seed] = max(assignment.partition_vector) + 1
        if verbosity > 1:
            print('New Partition: {0}'.format(new_assignment))
        if verbosity > 0:
            print('Assigning to new partition...')

        new_assignment = Partition(new_assignment)
        scores = self.score_sample(members, new_assignment)
        assignment = self.make_new_assignment(members, scores, new_assignment,
                                              nreassign=len(members))
        if verbosity > 1:
            print('Returning: {0}'.format(assignment))

        return assignment

    def split_max_var(self, assignment):
        clusters = get_clusters(assignment)
        var_dict = {}

        for k in clusters.keys():
            var_dict[k] = self.var(clusters[k])

        print(var_dict)

        cluster_to_split, var = max(clusters.iteritems(),
                                    key=operator.itemgetter(1))

    def split_search(self, assignment, update=True):
        clusters = get_clusters(assignment)
        k = len(assignment)
        best_score = NEGINF

        for i in clusters:
            print('i: {0}'.format(i))
            test_assignment = self.split(i, assignment)
            # score = self.scorer.score(test_assignment)
            if len(test_assignment) == k + 1:
                curr_score = self.scorer.score(test_assignment, history=False)
                self.update(test_assignment)
            else:
                curr_score = -np.Inf
                print('Something has gone wrong')
            print(test_assignment)
            print(curr_score)

            if curr_score > best_score:
                best_score = curr_score
                best_assignment = test_assignment

        return best_assignment

    def test(self, record, tree, model=None):
        """
        TESTS AN ALIGNMENT AGAINST A TREE TOPOLOGY
        """
        tmp_record = copy.deepcopy(record)

        # if tree label set and record label set don't match
        header_set = set(tmp_record.headers)
        extra_in_tree = tree.labels - header_set
        extra_in_record = header_set - tree.labels

        if extra_in_tree:
            for lab in extra_in_tree:
                tmp_record.headers.append(lab)
                tmp_record.sequences.append(''.join(['-'] * tmp_record.seqlength))
            tmp_record.update()

        if extra_in_record:
            for lab in extra_in_record:
                i = tmp_record.headers.index(lab)
                tmp_record.headers = (tmp_record.headers[:i] +
                                      tmp_record.headers[i + 1:])
                tmp_record.sequences = (tmp_record.sequences[:i] +
                                        tmp_record.sequences[i + 1:])
            tmp_record.update()

        return tmp_alignment.likelihood(tree, self.tmpdir, fit_rates=True)
        # alignment_file = tmp_record.write_phylip('{0}/tmp_alignment.phy'.format(
        # self.tmpdir), interleaved=True)
        # newick_file = tree.write_to_file('{0}/tmp_tree.nwk'.format(self.tmpdir))
        # p = Phyml(tmp_record, self.tmpdir)
        # p.add_tempfile(alignment_file)
        # p.add_tempfile(newick_file)
        # p.add_flag('-i', alignment_file)
        # p.add_flag('-u', newick_file)
        # p.add_flag('-b', '0')    # no bootstraps
        # if tmp_record.datatype == 'dna':
        #     if model is None:
        #         model = 'GTR'
        #     p.add_flag('-m', model)
        #     p.add_flag('-d', 'nt')
        # else:
        #     if model is None:
        #         model = 'WAG'
        #     p.add_flag('-m', model)  # evolutionary model
        #     p.add_flag('-d', 'aa')   # datatype

        # p.add_flag('-o', 'n')    # no optimisation
        # return p.run().score

    def var(self, members):
        score = self.scorer.add(tuple(members)).score
        records = [self.Collection.records[i] for i in members]
        total_length = sum([r.seqlength for r in records])

        return score / total_length

    def optimise(self,
                 assignment,
                 nclusters=None,
                 update=True,
                 history=True,
                 sample_size=10,
                 nreassign=10,
                 max_stayed_put=25,
                 max_resets=5,
                 max_done_worse=5,
                 max_iter=1000):

        if nclusters is None:
            nclusters = self.nclusters

        assignment = self.constrain_assignment(assignment, nclusters)

        local_best_assignment = assignment
        local_best_score = self.scorer.score(local_best_assignment,
                                             history=False)
        current_assignment = local_best_assignment
        self.sampled = []

        print(self.status(current_assignment))

        while True:
            if self.stayed_put > max_stayed_put:
                print('stayed put too many times ({0})'.format(max_stayed_put))
                break
            if self.resets == max_resets:
                print('Reset limit reached ({0})'.format(max_resets))
                break
            if self.done_worse == max_done_worse:
                print('wandered off, resetting...')
                self.resets += 1
                self.done_worse = 0
                current_assignment = local_best_assignment
            if self.i == max_iter:
                print('max iterations reached')
                break

            new_assignment = self.move(sample_size, current_assignment,
                                       nreassign)
            new_assignment = self.constrain_assignment(new_assignment,
                                                       nclusters)
            score = self.scorer.score(new_assignment, history=history)
            self.update(new_assignment)

            if (score - local_best_score) > EPS:
                self.sampled = []
                local_best_score = score
                local_best_assignment = new_assignment
                self.stayed_put = 0
                self.done_worse = 0
                self.resets = 0
                print(self.status(new_assignment, '(Improved)'))
            elif np.abs(score - local_best_score) < EPS:
                self.stayed_put += 1
                self.done_worse = 0
                message = ('(No improvement - [{}/{}])'.format(self.stayed_put,
                                                               max_stayed_put))
                print(self.status(new_assignment, message))
            else:
                self.sampled = []
                # self.stayed_put = 0
                self.done_worse += 1
                message = '(Did worse - [{}/{}]'.format(self.done_worse,
                                                        max_done_worse)
                print(self.status(new_assignment, message))

            self.i += 1

        self._reset_counts()
        return local_best_assignment

    def optimise_with_variable_clusters(self,
                                        assignment,
                                        target_clusters,
                                        max_clusters,
                                        optimise_on_ascent=True,
                                        optimise_on_descent=True,
                                        update=True,
                                        **kwargs):

        if max_clusters < target_clusters:
            raise ValueError('max_clusters ({}) must be at least equal to '
                             'target_clusters ({})'.format(max_clusters, target_clusters))

        current_clusters = len(assignment)
        print('Optimising current assignment with {} clusters. Optimiser will '
              'ascend to {} clusters, and descend to a target of {} clusters'
              '.'.format(current_clusters, max_clusters, target_clusters))
        for n in range(current_clusters, max_clusters + 1):
            print("ASCENDING (optimisation:{}) -> Current target: "
                  "{} clusters".format(('ON' if optimise_on_ascent else 'OFF'),
                                       n))
            if optimise_on_ascent:
                assignment = self.optimise(assignment, nclusters=n, **kwargs)
            else:
                assignment = self.constrain_assignment(assignment, n)

        for n in range(max_clusters - 1, target_clusters - 1, -1):
            print('DESCENDING (optimisation:{}) -> Current target: {} '
                  'clusters'.format(('ON' if optimise_on_descent else 'OFF'),
                                    n))
            if optimise_on_descent:
                assignment = self.optimise(assignment, nclusters=n, **kwargs)
            else:
                assignment = self.constrain_assignment(assignment, n)

        return self.constrain_assignment(assignment, target_clusters)

    def write(self, filename):
        headers = ['Iteration', 'CPU Time', 'Likelihood', 'Partition',
                   'NClusters']
        output = [[i] + x + len(x[-1])
                  for (i, x) in enumerate(self.scorer.history)]

        with open(filename, 'w+') as file_:
            writer = csv.writer(file_, delimiter='\t', quoting=csv.QUOTE_NONE)
            writer.writerow(headers)
            writer.writerows(output)

Example #10

Show file

File: algorithms.py Project: mgperry/treeCl

 def __init__(self, members, records, analysis):
     self.members = tuple(members)
     self.records = [records[i] for i in self.members]
     self.scorer = Scorer(records, analysis)
     self.tree = self.scorer.add(self.members)

Example #11

Show file

File: algorithms.py Project: mgperry/treeCl

class EMTrees(object):
    def __init__(
        self,
        collection,
        nclusters,
        metric='euc',
        tmpdir=None,
    ):

        if not isinstance(nclusters, int) or nclusters <= 1:
            raise Exception('Need appropriate value for number of clusters.')

        self.nclusters = nclusters
        self.scorer = Scorer(collection.records, collection.analysis)  # Could check for entries
        self.datatype = collection.datatype
        self.metric = metric

        try:
            self.tmpdir
        except:
            self.tmpdir = collection.tmpdir

    def clusters_init(self):
        k = self.nclusters
        assignment = [0] * len(self.scorer.records)
        for i in range(k):
            assignment[np.random.randint(0, len(assignment))] = i + 1
        partition = Partition(assignment)
        clusters = [0] * k
        members = partition.get_membership()[1:]
        self.assign_clusters(clusters, members)
        for (index, record) in enumerate(self.scorer.records):
            scores = [self.ml(record, clusters[n]) for n in range(self.nclusters)]
            # print scores
            if assignment.count(assignment[index]) > 1 or assignment[index] == 0:
                assignment[index] = scores.index(max(scores)) + 1
        self.partition = Partition(assignment)
        self.L = self.scorer.score(self.partition)

    def random_partition(self):
        self.partition = Partition(tuple(np.random.randint(self.nclusters,
                                   size=len(self.scorer.records))))
        self.L = self.scorer.score(self.partition)

    def assign_clusters(self, clusters, members):
        for n in range(self.nclusters):
            if not clusters[n] or clusters[n].members != members[n]:
                clusters[n] = Cluster(members[n], self.scorer.records, self.scorer.analysis)

        return(clusters)

    def maximise(self, method):
        clusters = [0] * self.nclusters
        alg = getattr(self, method)
        count = 0

        while True:
            self.assign_clusters(clusters, self.partition.get_membership())
            assignment = list(self.partition.partition_vector)

            for (index, record) in enumerate(self.scorer.records):
                scores = [alg(record, clusters[n]) for n in range(self.nclusters)]
                # print scores
                if assignment.count(assignment[index]) > 1 or assignment[index] == 0:
                    assignment[index] = scores.index(max(scores)) + 1

            assignment = Partition(assignment)
            score = self.scorer.score(assignment)

            if score > self.L:
                self.L = score
                self.partition = assignment

            else:
                count += 1
                if count > 1: break  # Algorithm is deterministic so no need for more iterations

    def maximise_random(self, method):
        clusters = [0] * self.nclusters
        alg = getattr(self, method)
        count = 0
        sampled = []

        while True:
            self.assign_clusters(clusters, self.partition.get_membership())
            assignment = list(self.partition.partition_vector)

            index = randint(0, len(self.scorer.records) - 1)

            if index in sampled:
                continue
            else:
                record = self.scorer.records[index]
                sampled.append(index)

            scores = [alg(record, clusters[n]) for n in range(self.nclusters)]

            if assignment.count(assignment[index]) > 1 or assignment[index] == 0:
                assignment[index] = scores.index(max(scores)) + 1

            assignment = Partition(assignment)
            score = self.scorer.score(assignment)

            if score > self.L:
                self.L = score
                self.partition = assignment
                sampled = []
                count = 0
            else:
                count += 1
                if count == len(assignment): break

    def maximise_heuristic(self):
        clusters = [0] * self.nclusters
        sampled = []

        for i in range(1000):
            self.assign_clusters(clusters, self.partition.get_membership())
            assignment = list(self.partition.partition_vector)

            index = randint(0, len(self.scorer.records) - 1)

            record = self.scorer.records[index]
            sampled.append(index)

            lls = [self.ml(record, clusters[n]) for n in range(self.nclusters)]

            a = {'ll': max(lls)}
            a['n'] = lls.index(a['ll'])
            lls.pop(a['n'])

            b = {'ll': max(lls)}
            b['n'] = lls.index(b['ll'])

            a['p'] = np.maths.exp(a['ll'] - logsum(a['ll'], b['ll']))

            if np.random.uniform() > a['p']:
                choice = a['n']
            else:
                choice = b['n']

            if assignment.count(assignment[index]) > 1 or assignment[index] == 0:
                assignment[index] = choice + 1

            assignment = Partition(assignment)

            if i % 10 == 0:
                score = self.scorer.score(assignment)

                if score > self.L:
                    self.max_L = score
                    self.max_partition = assignment

    def dist(self, obj1, obj2):
        distance = DistanceMatrix([obj1.tree, obj2.tree], self.metric)[0][1]
        return(-distance)

    def ml(self, record, cluster, verbose=1):
        p = Phyml(record, tmpdir=self.tmpdir)
        input_tree = os.path.join(self.tmpdir, 'input_tree')
        cluster.tree.write_to_file(input_tree)
        p.add_tempfile(input_tree)
        p.add_flag('--inputtree', input_tree)
        p.add_flag('-o', 'r')  # Optimise only on substitutions`
        p.add_flag('-a', 'e')
        p.add_flag('-b', 0)
        p.add_flag('-c', 4)
        p.add_flag('--quiet', '')

        if self.datatype == 'protein':
            p.add_flag('-d', 'aa')
        elif self.datatype == 'dna':
            p.add_flag('-d', 'nt')

        score = p.run(verbosity=verbose).score
        return(score)