Esempio n. 1
0
 def clusters_init(self):
     """Build an initial partition from ``self.nclusters`` random seed records.

     One distinct record is picked as the seed of each cluster (labels
     1..k); every other record starts with label 0.  A ``Cluster`` is built
     for each seed via ``self.assign_clusters`` and each record that is
     unlabelled, or that shares its label with another record, is then given
     the label of the cluster with the highest ``self.ml`` score.

     Sets ``self.partition`` and ``self.L`` (the partition's score according
     to ``self.scorer``).

     Raises:
         ValueError: (from numpy) if there are fewer records than clusters.
     """
     k = self.nclusters
     n_records = len(self.scorer.records)
     assignment = [0] * n_records

     # Seeds are drawn WITHOUT replacement: drawing each seed independently
     # can pick the same record twice, which overwrites the earlier label and
     # leaves that cluster without any seed.
     seeds = np.random.choice(n_records, size=k, replace=False)
     for label, seed in enumerate(seeds, start=1):
         assignment[int(seed)] = label

     partition = Partition(assignment)
     clusters = [0] * k
     # The first membership group is dropped - presumably the unassigned
     # (label 0) records; TODO confirm against Partition.get_membership.
     members = partition.get_membership()[1:]
     self.assign_clusters(clusters, members)

     for (index, record) in enumerate(self.scorer.records):
         scores = [self.ml(record, clusters[n]) for n in range(k)]
         # A record that is the only holder of its label is left alone, so
         # reassignment never empties a cluster.
         if assignment.count(assignment[index]) > 1 or assignment[index] == 0:
             assignment[index] = scores.index(max(scores)) + 1

     self.partition = Partition(assignment)
     self.L = self.scorer.score(self.partition)
Esempio n. 2
0
    def maximise(self, method):
        """Repeatedly sweep over all records, moving each to its best cluster.

        ``method`` is the name of a scoring method on this object (looked up
        with ``getattr``) that is called as ``method(record, cluster)``.  A
        sweep's partition is adopted only when its score beats ``self.L``;
        the loop ends after the second sweep that fails to improve.
        """
        clusters = [0] * self.nclusters
        score_fn = getattr(self, method)
        failures = 0

        while failures < 2:
            self.assign_clusters(clusters, self.partition.get_membership())
            labels = list(self.partition.partition_vector)

            for position, record in enumerate(self.scorer.records):
                fits = [
                    score_fn(record, clusters[c])
                    for c in range(self.nclusters)
                ]
                current = labels[position]
                # The sole holder of a non-zero label stays where it is.
                if labels.count(current) <= 1 and current != 0:
                    continue
                labels[position] = fits.index(max(fits)) + 1

            candidate = Partition(labels)
            candidate_score = self.scorer.score(candidate)

            if candidate_score > self.L:
                self.L = candidate_score
                self.partition = candidate
            else:
                # No improvement this sweep; a second miss ends the loop.
                failures += 1
Esempio n. 3
0
    def maximise_random(self, method):
        """Improve the partition by reassigning one randomly chosen record at a time.

        ``method`` names a scoring method on this object, called as
        ``method(record, cluster)``.  Records are drawn at random, skipping
        those already tried since the last improvement.  An improving move is
        adopted and resets the bookkeeping; the search stops once the number
        of consecutive non-improving moves equals ``len()`` of the candidate
        partition.
        """
        clusters = [0] * self.nclusters
        score_fn = getattr(self, method)
        failures = 0
        visited = []

        while True:
            self.assign_clusters(clusters, self.partition.get_membership())
            labels = list(self.partition.partition_vector)

            pick = randint(0, len(self.scorer.records) - 1)
            if pick in visited:
                # Already tried since the last improvement; draw again.
                continue
            record = self.scorer.records[pick]
            visited.append(pick)

            fits = [
                score_fn(record, clusters[c]) for c in range(self.nclusters)
            ]

            current = labels[pick]
            if labels.count(current) > 1 or current == 0:
                labels[pick] = fits.index(max(fits)) + 1

            candidate = Partition(labels)
            candidate_score = self.scorer.score(candidate)

            if candidate_score > self.L:
                self.L = candidate_score
                self.partition = candidate
                visited = []
                failures = 0
                continue

            failures += 1
            if failures == len(candidate):
                break
Esempio n. 4
0
    def split(self, k, assignment, verbosity=1):
        """
        Split cluster ``k`` by seeding a new cluster with its least
        representative alignment.

        Args:
            k: key of the cluster to split, as used by
                ``get_clusters(assignment)``.
            assignment: partition object exposing ``partition_vector``.
            verbosity: 0 is silent, 1 prints progress messages, >1 also
                prints the partitions involved.

        Returns:
            ``assignment`` unchanged when the cluster has a single member;
            otherwise a new partition in which at least the seed record
            carries a fresh label (``max(partition_vector) + 1``).
        """
        if verbosity > 1:
            print(assignment)
        members = get_clusters(assignment)[k]
        if len(members) == 1:
            return assignment

        new_label = max(assignment.partition_vector) + 1
        if len(members) == 2:
            # Nothing to score: just move the first member out on its own.
            new_partition_vector = list(assignment.partition_vector)
            new_partition_vector[members[0]] = new_label
            return Partition(new_partition_vector)

        tree = self.get_cluster_trees(assignment)[k]
        alignment_scores = {}
        if verbosity > 0:
            print('Calculating alignment scores...')

        for i in members:
            r = self.Collection.records[i]
            # per-site likelihood
            alignment_scores[i] = self.test(r, tree) / float(r.seqlength)

        # The member with the lowest per-site score seeds the new cluster.
        # Iterating the dict directly works on both Python 2 and 3
        # (``iteritems`` exists only on Python 2).
        seed = min(alignment_scores, key=alignment_scores.get)
        if verbosity > 0:
            # convert to 1-based indexing
            print('Splitting on {0}.'.format(seed + 1))

        new_assignment = list(assignment.partition_vector)
        new_assignment[seed] = new_label
        if verbosity > 1:
            print('New Partition: {0}'.format(new_assignment))
        if verbosity > 0:
            print('Assigning to new partition...')

        new_assignment = Partition(new_assignment)
        scores = self.score_sample(members, new_assignment)
        assignment = self.make_new_assignment(members, scores, new_assignment,
                                              nreassign=len(members))
        if verbosity > 1:
            print('Returning: {0}'.format(assignment))

        return assignment
Esempio n. 5
0
def get_partition(clusters):
    """Convert a clusters container into a ``Partition``.

    ``clusters`` is either a dict mapping label -> member indices or a
    sequence whose position is the label.  The result has one slot per
    member (the total member count), each holding its cluster's label.
    """
    if isinstance(clusters, dict):
        labels = clusters
    else:
        labels = range(len(clusters))

    total = sum([len(clusters[label]) for label in labels])
    pvec = [0] * total

    for label in labels:
        group = clusters[label]
        for member in group:
            pvec[member] = label

    return Partition(tuple(pvec))
Esempio n. 6
0
 def clusters_init(self):
     """Build an initial partition from ``self.nclusters`` random seed records.

     One distinct record is picked as the seed of each cluster (labels
     1..k); every other record starts with label 0.  A ``Cluster`` is built
     for each seed via ``self.assign_clusters`` and each record that is
     unlabelled, or that shares its label with another record, is then given
     the label of the cluster with the highest ``self.ml`` score.

     Sets ``self.partition`` and ``self.L`` (the partition's score according
     to ``self.scorer``).

     Raises:
         ValueError: (from numpy) if there are fewer records than clusters.
     """
     k = self.nclusters
     n_records = len(self.scorer.records)
     assignment = [0] * n_records

     # Seeds are drawn WITHOUT replacement: drawing each seed independently
     # can pick the same record twice, which overwrites the earlier label and
     # leaves that cluster without any seed.
     seeds = np.random.choice(n_records, size=k, replace=False)
     for label, seed in enumerate(seeds, start=1):
         assignment[int(seed)] = label

     partition = Partition(assignment)
     clusters = [0] * k
     # The first membership group is dropped - presumably the unassigned
     # (label 0) records; TODO confirm against Partition.get_membership.
     members = partition.get_membership()[1:]
     self.assign_clusters(clusters, members)

     for (index, record) in enumerate(self.scorer.records):
         scores = [
             self.ml(record, clusters[n]) for n in range(k)
         ]
         # A record that is the only holder of its label is left alone, so
         # reassignment never empties a cluster.
         if assignment.count(
                 assignment[index]) > 1 or assignment[index] == 0:
             assignment[index] = scores.index(max(scores)) + 1

     self.partition = Partition(assignment)
     self.L = self.scorer.score(self.partition)
Esempio n. 7
0
    def embedding_plotter(
        self,
        coordinates,
        dimensions,
        partition=None,
        add_sphere=False,
        xlab='PCo1',
        ylab='PCo2',
        zlab='PCo3',
        title='Trees embedded in dimension-reduced space',
        outfile=False,
    ):
        """Scatter-plot embedded points, coloured by cluster membership.

        Args:
            coordinates: array whose transpose unpacks into the positional
                arguments of ``ax.scatter`` (assumes one row per point and
                one column per dimension - TODO confirm).
            dimensions: 2 or 3 (validated with ``optioncheck``).
            partition: object exposing ``partition_vector``; when omitted,
                every point gets label 0 and therefore the same colour.
            add_sphere: in 3D, pass the axes through ``self.sphere``.
            xlab, ylab, zlab, title: axis labels and plot title.
            outfile: if truthy, the figure is saved to ``<outfile>.pdf``.

        Returns:
            The matplotlib figure.
        """
        optioncheck(dimensions, [2, 3])
        partition = (partition or Partition(tuple([0] * len(coordinates))))

        # One colour per possible label, cycling through the basic matplotlib
        # colour codes.  islice works on Python 2 and 3 alike, whereas
        # subscripting the result of zip() fails on Python 3.
        colours = tuple(
            itertools.islice(itertools.cycle('bgrcmyk'), len(partition)))
        # Labels index the palette as ``label - 1``, so label 0 wraps round
        # to the last colour.
        colour_mapping = np.array(
            [colours[i - 1] for i in partition.partition_vector])
        fig = plt.figure()

        if dimensions == 3:
            ax = fig.add_subplot(111,
                                 projection='3d',
                                 xlabel=xlab,
                                 ylabel=ylab,
                                 zlabel=zlab,
                                 title=title)
            if add_sphere:
                ax = self.sphere(ax)

        else:
            ax = fig.add_subplot(111, xlabel=xlab, ylabel=ylab, title=title)

        ax.scatter(*coordinates.T, color=colour_mapping)
        # ax.set_aspect(1)

        if outfile:
            fig.savefig('{0}.pdf'.format(outfile))

        return fig
Esempio n. 8
0
    def maximise_heuristic(self):
        """Stochastic reassignment: 1000 random single-record proposals.

        For a randomly drawn record, the two clusters with the highest
        ``self.ml`` score are found and one of them is chosen at random,
        the better one with probability
        ``exp(ll_best - logsum(ll_best, ll_second))``.  Every tenth
        iteration the proposal is scored, and if it beats ``self.L`` it is
        stored in ``self.max_L`` / ``self.max_partition``.

        NOTE(review): ``self.partition`` and ``self.L`` are never updated
        here, so every proposal starts from the same partition - confirm
        this is intended.
        """
        clusters = [0] * self.nclusters

        for i in range(1000):
            self.assign_clusters(clusters, self.partition.get_membership())
            assignment = list(self.partition.partition_vector)

            index = randint(0, len(self.scorer.records) - 1)
            record = self.scorer.records[index]

            lls = [self.ml(record, clusters[n]) for n in range(self.nclusters)]

            # Rank cluster indices by score, best first.  Ranking indices
            # (rather than popping the best score and searching the shortened
            # list) keeps the runner-up's index valid for the full list; the
            # stable sort preserves first-occurrence tie-breaking.
            ranked = sorted(range(len(lls)), key=lls.__getitem__,
                            reverse=True)
            best, runner_up = ranked[0], ranked[1]

            # Probability of the best cluster relative to the top two.
            p_best = np.exp(lls[best] - logsum(lls[best], lls[runner_up]))

            # Pick the best cluster with probability p_best.
            if np.random.uniform() < p_best:
                choice = best
            else:
                choice = runner_up

            # The sole holder of a non-zero label is never moved.
            if assignment.count(
                    assignment[index]) > 1 or assignment[index] == 0:
                assignment[index] = choice + 1

            assignment = Partition(assignment)

            if i % 10 == 0:
                score = self.scorer.score(assignment)

                if score > self.L:
                    self.max_L = score
                    self.max_partition = assignment
Esempio n. 9
0
    def make_new_assignment(self, sample, scores, assignment, nreassign=1,
                            choose='max'):
        """
        Make a new partition by reassigning records between clusters.

        Each row of ``scores`` is normalised by its row sum.  Rows are then
        ranked by their largest (``choose='max'``) or smallest
        (``choose='min'``) normalised entry; the ``nreassign`` highest-ranked
        (max) or lowest-ranked (min) rows are reassigned to the cluster
        holding the row's highest raw score.

        Args:
            sample: record indices; ``sample[i]`` is the record scored in
                row ``i`` of ``scores`` (assumed parallel - TODO confirm).
            scores: 2-D numpy array, one row per sampled record and one
                column per cluster.
            assignment: partition object exposing ``partition_vector``.
            nreassign: number of records to reassign; zero or less
                reassigns nothing.
            choose: ``'max'`` or ``'min'`` (validated with ``optioncheck``).

        Returns:
            A new ``Partition``; ``assignment`` itself is not modified.
        """
        optioncheck(choose, ('max', 'min'))
        new_assignment = list(assignment.partition_vector)

        # Guard the zero case explicitly: the slice ``[-0:]`` selects every
        # row rather than none.
        if nreassign > 0:
            new_clusters = scores.argmax(axis=1)
            M = scores / scores.sum(axis=1)[:, np.newaxis]
            if choose == 'max':
                reassignments = M.max(axis=1).argsort()[-nreassign:]
            else:
                reassignments = M.min(axis=1).argsort()[:nreassign]

            for i in reassignments:
                # because cluster number is in range
                # [1,x], and new_clusters is in range [0,x-1]
                new_assignment[sample[i]] = new_clusters[i] + 1

        return Partition(tuple(new_assignment))
Esempio n. 10
0
    def __init__(self, nclusters, collection, tmpdir=TMPDIR,
                 analysis='nj', initial_assignment=None, scorer=None):
        """Set up the optimiser state and score the initial assignment.

        Args:
            nclusters: number of clusters; also the key under which the
                initial score and assignment are stored.
            collection: object exposing ``records``, ``datatype``,
                ``calc_NJ_trees()`` and ``len()``.
            tmpdir: directory for temporary files.
            analysis: one of ``ANALYSES``, ``'tc'`` or ``'TreeCollection'``;
                ``'tc'`` is stored as ``'TreeCollection'``.
            initial_assignment: starting partition; defaults to all records
                labelled 0.
            scorer: a ``Scorer`` instance to reuse; anything else is
                replaced by a fresh ``Scorer(collection)``.
        """
        optioncheck(analysis, ANALYSES + ['tc', 'TreeCollection'])
        # Normalise the 'tc' alias.  The test must be on the *argument*:
        # ``self.analysis`` has not been assigned yet at this point.
        if analysis == 'tc':
            self.analysis = 'TreeCollection'
        else:
            self.analysis = analysis

        self.Collection = collection

        # Trees are only computed when the first record lacks one.
        if not self.Collection.records[0].tree:
            print('Calculating {} trees for collection...'.format(analysis))
            self.Collection.calc_NJ_trees()

        self.datatype = collection.datatype
        if isinstance(scorer, Scorer):
            self.scorer = scorer
        else:
            self.scorer = Scorer(self.Collection)

        self.nclusters = nclusters
        self.tmpdir = tmpdir

        print('Calculating initial scores...')
        if initial_assignment is None:
            initial_assignment = Partition(tuple([0] * len(collection)))
            # initial_assignment = self.random_partition(nclusters)

        # Best score / assignment seen so far, keyed by number of clusters.
        self.global_best_scores = {}
        self.global_best_assignments = {}
        self.global_best_scores[self.nclusters] = self.scorer.score(
            initial_assignment, history=True)
        self.global_best_assignments[self.nclusters] = initial_assignment

        # Counters, all starting at zero.
        self.done_worse = 0
        self.stayed_put = 0
        self.i = 0
        self.resets = 0
        self.merges = 0
Esempio n. 11
0
class EMTrees(object):
    """Reassign records among a fixed number of clusters to improve a score.

    The current partition is held in ``self.partition`` and its score, as
    reported by ``self.scorer``, in ``self.L``.  One of ``clusters_init`` or
    ``random_partition`` must be called before the ``maximise*`` methods,
    which read both attributes.
    """

    def __init__(
        self,
        collection,
        nclusters,
        metric='euc',
        tmpdir=None,
    ):
        """
        Args:
            collection: object exposing ``records``, ``analysis``,
                ``datatype`` and ``tmpdir``.
            nclusters: number of clusters; must be an int greater than 1.
            metric: metric name handed to ``DistanceMatrix`` by ``dist``.
            tmpdir: directory for temporary files; defaults to
                ``collection.tmpdir``.

        Raises:
            Exception: if ``nclusters`` is not an int greater than 1.
        """
        if not isinstance(nclusters, int) or nclusters <= 1:
            raise Exception('Need appropriate value for number of clusters.')

        self.nclusters = nclusters
        self.scorer = Scorer(collection.records, collection.analysis)  # Could check for entries
        self.datatype = collection.datatype
        self.metric = metric

        # Honour an explicitly supplied tmpdir, falling back to the
        # collection's own directory when none is given.
        self.tmpdir = tmpdir if tmpdir is not None else collection.tmpdir

    def clusters_init(self):
        """Build an initial partition from ``self.nclusters`` random seeds.

        One distinct record seeds each cluster (labels 1..k); all other
        records start with label 0.  Each record that is unlabelled, or that
        shares its label with another record, is then given the label of the
        cluster with the highest ``self.ml`` score.  Sets ``self.partition``
        and ``self.L``.

        Raises:
            ValueError: (from numpy) if there are fewer records than
                clusters.
        """
        k = self.nclusters
        n_records = len(self.scorer.records)
        assignment = [0] * n_records

        # Seeds are drawn WITHOUT replacement: drawing each seed
        # independently can pick the same record twice, which overwrites the
        # earlier label and leaves that cluster without any seed.
        seeds = np.random.choice(n_records, size=k, replace=False)
        for label, seed in enumerate(seeds, start=1):
            assignment[int(seed)] = label

        partition = Partition(assignment)
        clusters = [0] * k
        # The first membership group is dropped - presumably the unassigned
        # (label 0) records; TODO confirm against Partition.get_membership.
        members = partition.get_membership()[1:]
        self.assign_clusters(clusters, members)

        for (index, record) in enumerate(self.scorer.records):
            scores = [self.ml(record, clusters[n]) for n in range(k)]
            # The sole holder of a label is left alone so that reassignment
            # never empties a cluster.
            if assignment.count(assignment[index]) > 1 or assignment[index] == 0:
                assignment[index] = scores.index(max(scores)) + 1

        self.partition = Partition(assignment)
        self.L = self.scorer.score(self.partition)

    def random_partition(self):
        """Set ``self.partition`` to uniformly random labels and score it.

        Labels are drawn with ``np.random.randint(self.nclusters, ...)``,
        i.e. from ``0 .. nclusters - 1``.
        NOTE(review): the other methods treat label 0 as "unassigned" and use
        labels ``1 .. nclusters`` - confirm how ``Partition`` handles this.
        """
        self.partition = Partition(tuple(np.random.randint(self.nclusters,
                                   size=len(self.scorer.records))))
        self.L = self.scorer.score(self.partition)

    def assign_clusters(self, clusters, members):
        """Refresh ``clusters`` in place so entry ``n`` matches ``members[n]``.

        An entry is rebuilt only when it is falsy (e.g. the ``0``
        placeholder) or its ``members`` attribute differs from
        ``members[n]``.

        Returns:
            The same ``clusters`` list.
        """
        for n in range(self.nclusters):
            if not clusters[n] or clusters[n].members != members[n]:
                clusters[n] = Cluster(members[n], self.scorer.records, self.scorer.analysis)

        return clusters

    def maximise(self, method):
        """Repeatedly sweep over all records, moving each to its best cluster.

        ``method`` names a scoring method on this object, called as
        ``method(record, cluster)``.  A sweep's partition is adopted only
        when its score beats ``self.L``; the loop ends after the second
        sweep that fails to improve.
        """
        clusters = [0] * self.nclusters
        alg = getattr(self, method)
        count = 0

        while True:
            self.assign_clusters(clusters, self.partition.get_membership())
            assignment = list(self.partition.partition_vector)

            for (index, record) in enumerate(self.scorer.records):
                scores = [alg(record, clusters[n]) for n in range(self.nclusters)]
                if assignment.count(assignment[index]) > 1 or assignment[index] == 0:
                    assignment[index] = scores.index(max(scores)) + 1

            assignment = Partition(assignment)
            score = self.scorer.score(assignment)

            if score > self.L:
                self.L = score
                self.partition = assignment

            else:
                count += 1
                if count > 1:
                    break  # Algorithm is deterministic so no need for more iterations

    def maximise_random(self, method):
        """Improve the partition by reassigning one random record at a time.

        Records already tried since the last improvement are skipped.  An
        improving move is adopted and resets the bookkeeping; the search
        stops once the number of consecutive non-improving moves equals
        ``len()`` of the candidate partition.
        """
        clusters = [0] * self.nclusters
        alg = getattr(self, method)
        count = 0
        sampled = []

        while True:
            self.assign_clusters(clusters, self.partition.get_membership())
            assignment = list(self.partition.partition_vector)

            index = randint(0, len(self.scorer.records) - 1)

            if index in sampled:
                continue
            else:
                record = self.scorer.records[index]
                sampled.append(index)

            scores = [alg(record, clusters[n]) for n in range(self.nclusters)]

            if assignment.count(assignment[index]) > 1 or assignment[index] == 0:
                assignment[index] = scores.index(max(scores)) + 1

            assignment = Partition(assignment)
            score = self.scorer.score(assignment)

            if score > self.L:
                self.L = score
                self.partition = assignment
                sampled = []
                count = 0
            else:
                count += 1
                if count == len(assignment):
                    break

    def maximise_heuristic(self):
        """Stochastic reassignment: 1000 random single-record proposals.

        For a randomly drawn record, the two clusters with the highest
        ``self.ml`` score are found and one of them is chosen at random,
        the better one with probability
        ``exp(ll_best - logsum(ll_best, ll_second))``.  Every tenth
        iteration the proposal is scored, and if it beats ``self.L`` it is
        stored in ``self.max_L`` / ``self.max_partition``.

        NOTE(review): ``self.partition`` and ``self.L`` are never updated
        here, so every proposal starts from the same partition - confirm
        this is intended.
        """
        clusters = [0] * self.nclusters

        for i in range(1000):
            self.assign_clusters(clusters, self.partition.get_membership())
            assignment = list(self.partition.partition_vector)

            index = randint(0, len(self.scorer.records) - 1)
            record = self.scorer.records[index]

            lls = [self.ml(record, clusters[n]) for n in range(self.nclusters)]

            # Rank cluster indices by score, best first.  Ranking indices
            # (rather than popping the best score and searching the shortened
            # list) keeps the runner-up's index valid for the full list; the
            # stable sort preserves first-occurrence tie-breaking.
            ranked = sorted(range(len(lls)), key=lls.__getitem__,
                            reverse=True)
            best, runner_up = ranked[0], ranked[1]

            # Probability of the best cluster relative to the top two.
            p_best = np.exp(lls[best] - logsum(lls[best], lls[runner_up]))

            # Pick the best cluster with probability p_best.
            if np.random.uniform() < p_best:
                choice = best
            else:
                choice = runner_up

            if assignment.count(assignment[index]) > 1 or assignment[index] == 0:
                assignment[index] = choice + 1

            assignment = Partition(assignment)

            if i % 10 == 0:
                score = self.scorer.score(assignment)

                if score > self.L:
                    self.max_L = score
                    self.max_partition = assignment

    def dist(self, obj1, obj2):
        """Return the negated ``self.metric`` distance between two trees.

        The distance is negated so that, like ``ml``, larger values mean a
        better fit when used as a scoring method.
        """
        distance = DistanceMatrix([obj1.tree, obj2.tree], self.metric)[0][1]
        return -distance

    def ml(self, record, cluster, verbose=1):
        """Score ``record`` against ``cluster``'s tree with a Phyml run.

        The cluster's tree is written to ``<tmpdir>/input_tree`` and passed
        to Phyml as the input tree.

        Returns:
            The ``score`` attribute of the Phyml run result.
        """
        p = Phyml(record, tmpdir=self.tmpdir)
        input_tree = os.path.join(self.tmpdir, 'input_tree')
        cluster.tree.write_to_file(input_tree)
        p.add_tempfile(input_tree)
        p.add_flag('--inputtree', input_tree)
        p.add_flag('-o', 'r')  # Optimise only on substitutions
        p.add_flag('-a', 'e')
        p.add_flag('-b', 0)
        p.add_flag('-c', 4)
        p.add_flag('--quiet', '')

        # Data-type flag; no '-d' flag is added for any other datatype.
        if self.datatype == 'protein':
            p.add_flag('-d', 'aa')
        elif self.datatype == 'dna':
            p.add_flag('-d', 'nt')

        score = p.run(verbosity=verbose).score
        return score
Esempio n. 12
0
 def random_partition(self):
     """Replace the current partition with random labels and rescore it.

     Each record receives an integer label drawn by
     ``np.random.randint(self.nclusters, ...)``, i.e. from
     ``0 .. nclusters - 1``.  The result is stored in ``self.partition`` and
     its score in ``self.L``.
     """
     n_labels = self.nclusters
     n_records = len(self.scorer.records)
     random_labels = np.random.randint(n_labels, size=n_records)
     self.partition = Partition(tuple(random_labels))
     self.L = self.scorer.score(self.partition)
Esempio n. 13
0
 def random_partition(self):
     """Replace the current partition with random labels and rescore it.

     Each record receives an integer label drawn by
     ``np.random.randint(self.nclusters, ...)``, i.e. from
     ``0 .. nclusters - 1``.  The result is stored in ``self.partition`` and
     its score in ``self.L``.
     """
     n_labels = self.nclusters
     n_records = len(self.scorer.records)
     random_labels = np.random.randint(n_labels, size=n_records)
     self.partition = Partition(tuple(random_labels))
     self.L = self.scorer.score(self.partition)
Esempio n. 14
0
class EMTrees(object):
    """Reassign records among a fixed number of clusters to improve a score.

    The current partition is held in ``self.partition`` and its score, as
    reported by ``self.scorer``, in ``self.L``.  One of ``clusters_init`` or
    ``random_partition`` must be called before the ``maximise*`` methods,
    which read both attributes.
    """

    def __init__(
        self,
        collection,
        nclusters,
        metric='euc',
        tmpdir=None,
    ):
        """
        Args:
            collection: object exposing ``records``, ``analysis``,
                ``datatype`` and ``tmpdir``.
            nclusters: number of clusters; must be an int greater than 1.
            metric: metric name handed to ``DistanceMatrix`` by ``dist``.
            tmpdir: directory for temporary files; defaults to
                ``collection.tmpdir``.

        Raises:
            Exception: if ``nclusters`` is not an int greater than 1.
        """
        if not isinstance(nclusters, int) or nclusters <= 1:
            raise Exception('Need appropriate value for number of clusters.')

        self.nclusters = nclusters
        self.scorer = Scorer(collection.records,
                             collection.analysis)  # Could check for entries
        self.datatype = collection.datatype
        self.metric = metric

        # Honour an explicitly supplied tmpdir, falling back to the
        # collection's own directory when none is given.
        self.tmpdir = tmpdir if tmpdir is not None else collection.tmpdir

    def clusters_init(self):
        """Build an initial partition from ``self.nclusters`` random seeds.

        One distinct record seeds each cluster (labels 1..k); all other
        records start with label 0.  Each record that is unlabelled, or that
        shares its label with another record, is then given the label of the
        cluster with the highest ``self.ml`` score.  Sets ``self.partition``
        and ``self.L``.

        Raises:
            ValueError: (from numpy) if there are fewer records than
                clusters.
        """
        k = self.nclusters
        n_records = len(self.scorer.records)
        assignment = [0] * n_records

        # Seeds are drawn WITHOUT replacement: drawing each seed
        # independently can pick the same record twice, which overwrites the
        # earlier label and leaves that cluster without any seed.
        seeds = np.random.choice(n_records, size=k, replace=False)
        for label, seed in enumerate(seeds, start=1):
            assignment[int(seed)] = label

        partition = Partition(assignment)
        clusters = [0] * k
        # The first membership group is dropped - presumably the unassigned
        # (label 0) records; TODO confirm against Partition.get_membership.
        members = partition.get_membership()[1:]
        self.assign_clusters(clusters, members)

        for (index, record) in enumerate(self.scorer.records):
            scores = [
                self.ml(record, clusters[n]) for n in range(k)
            ]
            # The sole holder of a label is left alone so that reassignment
            # never empties a cluster.
            if assignment.count(
                    assignment[index]) > 1 or assignment[index] == 0:
                assignment[index] = scores.index(max(scores)) + 1

        self.partition = Partition(assignment)
        self.L = self.scorer.score(self.partition)

    def random_partition(self):
        """Set ``self.partition`` to uniformly random labels and score it.

        Labels are drawn with ``np.random.randint(self.nclusters, ...)``,
        i.e. from ``0 .. nclusters - 1``.
        NOTE(review): the other methods treat label 0 as "unassigned" and use
        labels ``1 .. nclusters`` - confirm how ``Partition`` handles this.
        """
        self.partition = Partition(
            tuple(
                np.random.randint(self.nclusters,
                                  size=len(self.scorer.records))))
        self.L = self.scorer.score(self.partition)

    def assign_clusters(self, clusters, members):
        """Refresh ``clusters`` in place so entry ``n`` matches ``members[n]``.

        An entry is rebuilt only when it is falsy (e.g. the ``0``
        placeholder) or its ``members`` attribute differs from
        ``members[n]``.

        Returns:
            The same ``clusters`` list.
        """
        for n in range(self.nclusters):
            if not clusters[n] or clusters[n].members != members[n]:
                clusters[n] = Cluster(members[n], self.scorer.records,
                                      self.scorer.analysis)

        return clusters

    def maximise(self, method):
        """Repeatedly sweep over all records, moving each to its best cluster.

        ``method`` names a scoring method on this object, called as
        ``method(record, cluster)``.  A sweep's partition is adopted only
        when its score beats ``self.L``; the loop ends after the second
        sweep that fails to improve.
        """
        clusters = [0] * self.nclusters
        alg = getattr(self, method)
        count = 0

        while True:
            self.assign_clusters(clusters, self.partition.get_membership())
            assignment = list(self.partition.partition_vector)

            for (index, record) in enumerate(self.scorer.records):
                scores = [
                    alg(record, clusters[n]) for n in range(self.nclusters)
                ]
                if assignment.count(
                        assignment[index]) > 1 or assignment[index] == 0:
                    assignment[index] = scores.index(max(scores)) + 1

            assignment = Partition(assignment)
            score = self.scorer.score(assignment)

            if score > self.L:
                self.L = score
                self.partition = assignment

            else:
                count += 1
                if count > 1:
                    break  # Algorithm is deterministic so no need for more iterations

    def maximise_random(self, method):
        """Improve the partition by reassigning one random record at a time.

        Records already tried since the last improvement are skipped.  An
        improving move is adopted and resets the bookkeeping; the search
        stops once the number of consecutive non-improving moves equals
        ``len()`` of the candidate partition.
        """
        clusters = [0] * self.nclusters
        alg = getattr(self, method)
        count = 0
        sampled = []

        while True:
            self.assign_clusters(clusters, self.partition.get_membership())
            assignment = list(self.partition.partition_vector)

            index = randint(0, len(self.scorer.records) - 1)

            if index in sampled:
                continue
            else:
                record = self.scorer.records[index]
                sampled.append(index)

            scores = [alg(record, clusters[n]) for n in range(self.nclusters)]

            if assignment.count(
                    assignment[index]) > 1 or assignment[index] == 0:
                assignment[index] = scores.index(max(scores)) + 1

            assignment = Partition(assignment)
            score = self.scorer.score(assignment)

            if score > self.L:
                self.L = score
                self.partition = assignment
                sampled = []
                count = 0
            else:
                count += 1
                if count == len(assignment):
                    break

    def maximise_heuristic(self):
        """Stochastic reassignment: 1000 random single-record proposals.

        For a randomly drawn record, the two clusters with the highest
        ``self.ml`` score are found and one of them is chosen at random,
        the better one with probability
        ``exp(ll_best - logsum(ll_best, ll_second))``.  Every tenth
        iteration the proposal is scored, and if it beats ``self.L`` it is
        stored in ``self.max_L`` / ``self.max_partition``.

        NOTE(review): ``self.partition`` and ``self.L`` are never updated
        here, so every proposal starts from the same partition - confirm
        this is intended.
        """
        clusters = [0] * self.nclusters

        for i in range(1000):
            self.assign_clusters(clusters, self.partition.get_membership())
            assignment = list(self.partition.partition_vector)

            index = randint(0, len(self.scorer.records) - 1)
            record = self.scorer.records[index]

            lls = [self.ml(record, clusters[n]) for n in range(self.nclusters)]

            # Rank cluster indices by score, best first.  Ranking indices
            # (rather than popping the best score and searching the shortened
            # list) keeps the runner-up's index valid for the full list; the
            # stable sort preserves first-occurrence tie-breaking.
            ranked = sorted(range(len(lls)), key=lls.__getitem__,
                            reverse=True)
            best, runner_up = ranked[0], ranked[1]

            # Probability of the best cluster relative to the top two.
            p_best = np.exp(lls[best] - logsum(lls[best], lls[runner_up]))

            # Pick the best cluster with probability p_best.
            if np.random.uniform() < p_best:
                choice = best
            else:
                choice = runner_up

            if assignment.count(
                    assignment[index]) > 1 or assignment[index] == 0:
                assignment[index] = choice + 1

            assignment = Partition(assignment)

            if i % 10 == 0:
                score = self.scorer.score(assignment)

                if score > self.L:
                    self.max_L = score
                    self.max_partition = assignment

    def dist(self, obj1, obj2):
        """Return the negated ``self.metric`` distance between two trees.

        The distance is negated so that, like ``ml``, larger values mean a
        better fit when used as a scoring method.
        """
        distance = DistanceMatrix([obj1.tree, obj2.tree], self.metric)[0][1]
        return -distance

    def ml(self, record, cluster, verbose=1):
        """Score ``record`` against ``cluster``'s tree with a Phyml run.

        The cluster's tree is written to ``<tmpdir>/input_tree`` and passed
        to Phyml as the input tree.

        Returns:
            The ``score`` attribute of the Phyml run result.
        """
        p = Phyml(record, tmpdir=self.tmpdir)
        input_tree = os.path.join(self.tmpdir, 'input_tree')
        cluster.tree.write_to_file(input_tree)
        p.add_tempfile(input_tree)
        p.add_flag('--inputtree', input_tree)
        p.add_flag('-o', 'r')  # Optimise only on substitutions
        p.add_flag('-a', 'e')
        p.add_flag('-b', 0)
        p.add_flag('-c', 4)
        p.add_flag('--quiet', '')

        # Data-type flag; no '-d' flag is added for any other datatype.
        if self.datatype == 'protein':
            p.add_flag('-d', 'aa')
        elif self.datatype == 'dna':
            p.add_flag('-d', 'nt')

        score = p.run(verbosity=verbose).score
        return score
Esempio n. 15
0
def likelihood(partition, scorer):
    """Wrap ``partition`` in a ``Partition`` and return ``scorer``'s score for it."""
    wrapped = Partition(partition)
    return scorer.score(wrapped)
Esempio n. 16
0
 def get_true_partition(self):
     """Build, store and return the partition implied by ``self.class_list``.

     Entry ``k`` of ``self.class_list`` is the size of class ``k + 1``; the
     partition lists label 1 that many times, then label 2, and so on.  The
     result is cached on ``self.true_partition``.
     """
     labels = [
         label
         for label, size in enumerate(self.class_list, start=1)
         for _ in range(size)
     ]
     self.true_partition = Partition(labels)
     return self.true_partition
Esempio n. 17
0
 def random_partition(self, nclusters):
     """Return a ``Partition`` with one random label per item of ``self.Collection``.

     Labels are drawn with ``np.random.randint(nclusters, ...)``, i.e. from
     ``0 .. nclusters - 1``.
     """
     n_items = len(self.Collection)
     random_labels = np.random.randint(nclusters, size=n_items)
     return Partition(tuple(random_labels))
Esempio n. 18
0
def get_partition_from_file(filename):
    """Read a partition from the first line of ``filename``.

    The line is split on whitespace and every token converted to ``int``;
    any further lines in the file are ignored.
    """
    with open(filename) as handle:
        first_line = handle.readline()

    labels = tuple(int(token) for token in first_line.split())
    return Partition(labels)
Esempio n. 19
0
    print 'yep'

    print 'Can build Plotter from Collection + DistanceMatrix...',
    plotter_with_dm = Plotter(c, dm=dm)
    print 'yes'

    print 'Can build Plotter from a list of TrClSeq objects...',
    plotter_from_records = Plotter(records=c.records)
    print 'yes'

    print 'Can build Plotter from DistanceMatrix only...',
    plotter_just_dm = Plotter(dm=dm)
    print 'yes'

    print 'Testing plotting'
    p = Partition(tuple([1] * 15 + [2] * 15 + [3] * 15 + [4] * 15))
    p_rand = Partition(
        tuple([
            1, 3, 1, 4, 2, 3, 3, 3, 2, 2, 1, 3, 3, 4, 1, 4, 1, 1, 2, 4, 1, 2,
            2, 2, 2, 2, 3, 4, 2, 2, 1, 4, 3, 1, 4, 4, 3, 1, 3, 1, 3, 2, 4, 4,
            1, 4, 1, 2, 3, 4, 2, 4, 3, 2, 1, 3, 4, 4, 1, 3
        ]))
    fig1 = plotter_from_collection.embedding('MDS', 2, p)  # 2d MDS embedding
    fig2 = plotter_from_collection.embedding('MDS', 3, p)  # 3d MDS embedding
    fig3 = plotter_from_collection.embedding('spectral', 2,
                                             p_rand)  # 2d spectral
    fig4 = plotter_from_collection.embedding('spectral', 3,
                                             p_rand)  # 3d spectral
    fig5 = plotter_just_dm.heatmap(p)  # distance matrix as
    fig6 = plotter_just_dm.heatmap(p_rand)
    plt.show()
Esempio n. 20
0
 def merge(self, assignment, label1, label2):
     """Return a new ``Partition`` in which every ``label1`` is replaced by ``label2``.

     ``assignment`` itself is left untouched.
     """
     relabelled = []
     for label in assignment.partition_vector:
         if label != label1:
             relabelled.append(label)
         else:
             relabelled.append(label2)
     return Partition(tuple(relabelled))
Esempio n. 21
0
        if not len(p) == len(c):
            print('Partition is of incorrect length '
                  '(expected {0}, got {1}'.format(len(c), len(p)))
            sys.exit(1)

        o = Optimiser(args.nclusters, c, tmpdir=new_tmpdir,
                      initial_assignment=p)

    else:
        o = Optimiser(args.nclusters, c, tmpdir=new_tmpdir)

    # Hierarchical clustering via likelihood
    if args.hierarchical is not None:
        if args.hierarchical == 'top_down':
            p = Partition(tuple([1] * len(c)))
        elif args.hierarchical == 'bottom_up':
            p = Partition(range(1, len(c) + 1))

        result = o.constrain_assignment(p, args.nclusters)
        # o.Scorer.clear_history()
        score = o.Scorer.score(result)
        o.global_best_assignments[args.nclusters] = result
        o.global_best_scores[args.nclusters] = score

    # Quit early
    if args.quit:
        pass

    else:
        if args.merge is True: