def clusters_init(self):
    """Seed the partition by planting the k cluster labels at random record
    positions, then move every movable record to its best-scoring (ML)
    cluster. Stores the result in self.partition and its score in self.L.
    """
    nrecords = len(self.scorer.records)
    assignment = [0] * nrecords
    # Plant labels 1..k at random indices; a later plant may overwrite an
    # earlier one, exactly as in the original scheme.
    for label in range(1, self.nclusters + 1):
        assignment[np.random.randint(0, nrecords)] = label
    seed_partition = Partition(assignment)
    # get_membership()[1:] skips the group of still-unlabelled (0) records.
    clusters = self.assign_clusters([0] * self.nclusters,
                                    seed_partition.get_membership()[1:])
    for idx, record in enumerate(self.scorer.records):
        likelihoods = [self.ml(record, cl) for cl in clusters]
        current = assignment[idx]
        # Only move a record if it is unlabelled or not the sole member of
        # its cluster (never empty a singleton cluster).
        if current == 0 or assignment.count(current) > 1:
            assignment[idx] = likelihoods.index(max(likelihoods)) + 1
    self.partition = Partition(assignment)
    self.L = self.scorer.score(self.partition)
def maximise(self, method):
    """Deterministic EM-style loop.

    `method` names a scoring method on self (looked up via getattr); each
    sweep rebuilds the clusters, reassigns every movable record to its
    best-scoring cluster, and keeps the new partition only if the overall
    score improves. Stops after two consecutive non-improving sweeps.
    """
    clusters = [0] * self.nclusters
    score_fn = getattr(self, method)
    stalls = 0
    while True:
        self.assign_clusters(clusters, self.partition.get_membership())
        labels = list(self.partition.partition_vector)
        for idx, record in enumerate(self.scorer.records):
            per_cluster = [score_fn(record, clusters[n])
                           for n in range(self.nclusters)]
            # Movable = unlabelled, or not the only member of its cluster.
            if labels[idx] == 0 or labels.count(labels[idx]) > 1:
                labels[idx] = per_cluster.index(max(per_cluster)) + 1
        candidate = Partition(labels)
        candidate_score = self.scorer.score(candidate)
        if candidate_score > self.L:
            self.L = candidate_score
            self.partition = candidate
        else:
            stalls += 1
            if stalls > 1:
                # Algorithm is deterministic, so further sweeps cannot help.
                break
def maximise_random(self, method):
    """Stochastic variant of maximise: one randomly drawn, not-yet-sampled
    record is reassigned per iteration. An improvement empties the sampled
    pool and resets the failure count; the loop ends once there have been
    as many consecutive failures as there are records.
    """
    clusters = [0] * self.nclusters
    score_fn = getattr(self, method)
    failures = 0
    tried = []
    while True:
        self.assign_clusters(clusters, self.partition.get_membership())
        labels = list(self.partition.partition_vector)
        idx = randint(0, len(self.scorer.records) - 1)
        if idx in tried:
            continue  # already sampled since the last improvement
        record = self.scorer.records[idx]
        tried.append(idx)
        per_cluster = [score_fn(record, cl) for cl in clusters]
        if labels.count(labels[idx]) > 1 or labels[idx] == 0:
            labels[idx] = per_cluster.index(max(per_cluster)) + 1
        candidate = Partition(labels)
        candidate_score = self.scorer.score(candidate)
        if candidate_score > self.L:
            self.L = candidate_score
            self.partition = candidate
            tried = []
            failures = 0
        else:
            failures += 1
            if failures == len(candidate):
                break
def split(self, k, assignment, verbosity=1):
    """Split cluster `k` by peeling off its least representative record.

    The record with the lowest per-site likelihood against the cluster's
    tree is moved into a brand-new cluster, then the remaining members are
    reassigned between the old and new clusters.

    :param k: label of the cluster to split
    :param assignment: current Partition
    :param verbosity: 0 = silent, 1 = progress messages, 2 = debug dumps
    :return: the new assignment as a Partition (unchanged for singletons)
    """
    if verbosity > 1:
        print(assignment)
    members = get_clusters(assignment)[k]
    if len(members) == 1:
        # A singleton cannot be split.
        return assignment
    elif len(members) == 2:
        # Two members: just move the first into a brand-new cluster.
        new_partition_vector = list(assignment.partition_vector)
        new_partition_vector[members[0]] = max(assignment.partition_vector) + 1
        new_assignment = Partition(new_partition_vector)
        return new_assignment

    tree = self.get_cluster_trees(assignment)[k]
    alignment_scores = {}
    if verbosity > 0:
        print('Calculating alignment scores...')
    for i in members:
        r = self.Collection.records[i]
        # Normalise by sequence length -> per-site likelihood.
        alignment_scores[i] = self.test(r, tree) / float(r.seqlength)

    # .items() behaves identically here on Python 2 and is required on
    # Python 3 (the original .iteritems() is Python-2 only).
    seed, min_score = min(alignment_scores.items(),
                          key=operator.itemgetter(1))
    if verbosity > 0:
        # Previously printed unconditionally; guarded now for consistency
        # with the other progress messages in this method.
        print('Splitting on {0}.'.format(seed + 1))  # 1-based indexing

    new_assignment = list(assignment.partition_vector)
    new_assignment[seed] = max(assignment.partition_vector) + 1
    if verbosity > 1:
        print('New Partition: {0}'.format(new_assignment))
    if verbosity > 0:
        print('Assigning to new partition...')
    new_assignment = Partition(new_assignment)
    scores = self.score_sample(members, new_assignment)
    assignment = self.make_new_assignment(members, scores, new_assignment,
                                          nreassign=len(members))
    if verbosity > 1:
        print('Returning: {0}'.format(assignment))
    return assignment
def get_partition(clusters):
    """Build a Partition from clusters given as either a dict or a sequence
    mapping cluster label -> iterable of member indices: every member index
    receives its cluster's label in the resulting vector.
    """
    keys = clusters if isinstance(clusters, dict) else range(len(clusters))
    total = sum(len(clusters[key]) for key in keys)
    vector = [0] * total
    for label in keys:
        for member in clusters[label]:
            vector[member] = label
    return Partition(tuple(vector))
def clusters_init(self):
    """Initialise self.partition / self.L: scatter the k cluster labels
    over random records, then pull each movable record into the cluster
    that gives it the highest ML score.
    """
    total = len(self.scorer.records)
    assignment = [0] * total
    for label in range(1, self.nclusters + 1):
        # Random placement; collisions simply overwrite, as before.
        assignment[np.random.randint(0, total)] = label
    # Skip membership group 0 (the still-unlabelled records).
    clusters = self.assign_clusters(
        [0] * self.nclusters,
        Partition(assignment).get_membership()[1:])
    for idx, record in enumerate(self.scorer.records):
        lls = [self.ml(record, cl) for cl in clusters]
        if assignment[idx] == 0 or assignment.count(assignment[idx]) > 1:
            assignment[idx] = lls.index(max(lls)) + 1
    self.partition = Partition(assignment)
    self.L = self.scorer.score(self.partition)
def embedding_plotter(
    self,
    coordinates,
    dimensions,
    partition=None,
    add_sphere=False,
    xlab='PCo1',
    ylab='PCo2',
    zlab='PCo3',
    title='Trees embedded in dimension-reduced space',
    outfile=False,
):
    """Scatter-plot trees embedded in 2 or 3 dimensions.

    Points are coloured according to cluster membership specified by a
    Partition object; with no Partition every point gets label 0 and
    therefore the same colour.

    :param coordinates: array of embedded points, one row per tree
    :param dimensions: 2 or 3 (validated by optioncheck)
    :param outfile: if truthy, the figure is also saved as '<outfile>.pdf'
    :return: the matplotlib Figure
    """

    optioncheck(dimensions, [2, 3])
    # Default: one dummy cluster (label 0) covering every point.
    partition = (partition or Partition(tuple([0] * len(coordinates))))
    # One colour code per point, cycling through seven matplotlib colours.
    # NOTE(review): subscripting the result of zip() is Python-2 only —
    # on Python 3 zip() returns an iterator; confirm target interpreter.
    colours = zip(
        *zip(range(len(partition)), itertools.cycle('bgrcmyk')))[1]
    print(colours)  # NOTE(review): looks like leftover debug output
    # Colours are indexed by label - 1 (a label of 0 wraps to colours[-1]).
    colour_mapping = np.array(
        [colours[i - 1] for i in partition.partition_vector])
    fig = plt.figure()
    if dimensions == 3:
        ax = fig.add_subplot(111, projection='3d', xlabel=xlab,
                             ylabel=ylab, zlabel=zlab, title=title)
        if add_sphere:
            # Overlay a reference sphere on the 3d axes.
            ax = self.sphere(ax)
    else:
        ax = fig.add_subplot(111, xlabel=xlab, ylabel=ylab, title=title)
    ax.scatter(*coordinates.T, color=colour_mapping)

    # ax.set_aspect(1)

    if outfile:
        fig.savefig('{0}.pdf'.format(outfile))
    return fig
def maximise_heuristic(self):
    """Stochastic reassignment over a fixed 1000 iterations.

    Each iteration picks a random record and chooses between its two
    best-scoring clusters with probability given by a two-way softmax of
    their log-likelihoods. Every 10th iteration the candidate partition is
    scored and the best result is recorded in max_L / max_partition.
    """
    clusters = [0] * self.nclusters
    for i in range(1000):
        self.assign_clusters(clusters, self.partition.get_membership())
        assignment = list(self.partition.partition_vector)
        index = randint(0, len(self.scorer.records) - 1)
        record = self.scorer.records[index]
        # (The original also kept a `sampled` list that was never read;
        # it has been removed.)
        lls = [self.ml(record, clusters[n]) for n in range(self.nclusters)]
        # Rank cluster indices by log-likelihood WITHOUT mutating lls: the
        # old code popped the best entry first, so the runner-up's index
        # could be off by one whenever it followed the best in the list.
        ranked = sorted(range(len(lls)), key=lls.__getitem__, reverse=True)
        a = {'n': ranked[0], 'll': lls[ranked[0]]}
        b = {'n': ranked[1], 'll': lls[ranked[1]]}
        # P(best) under a two-way softmax. Bug fix: np.maths.exp does not
        # exist (AttributeError) — use np.exp.
        a['p'] = np.exp(a['ll'] - logsum(a['ll'], b['ll']))
        # Bug fix: the comparison was inverted, selecting the *worse*
        # cluster with the higher probability.
        if np.random.uniform() < a['p']:
            choice = a['n']
        else:
            choice = b['n']
        if assignment.count(assignment[index]) > 1 or assignment[index] == 0:
            assignment[index] = choice + 1
        assignment = Partition(assignment)
        if i % 10 == 0:
            score = self.scorer.score(assignment)
            # NOTE(review): compares against self.L but records the best in
            # self.max_L / self.max_partition — confirm this asymmetry is
            # intentional.
            if score > self.L:
                self.max_L = score
                self.max_partition = assignment
def make_new_assignment(self, sample, scores, assignment, nreassign=1,
                        choose='max'):
    """Make a new Partition by reassigning records between clusters.

    `scores` holds one row per sampled record and one column per cluster;
    each selected record is moved to its argmax column (cluster labels are
    1-based). With choose='max' the `nreassign` records with the highest
    normalised row maxima are moved; with 'min', those with the lowest
    normalised row minima.
    """
    optioncheck(choose, ('max', 'min'))
    best_cluster = scores.argmax(axis=1)
    # Row-normalise so scores are comparable across records.
    normalised = scores / scores.sum(axis=1)[:, np.newaxis]
    if choose == 'max':
        picked = normalised.max(axis=1).argsort()[-nreassign:]
    else:
        picked = normalised.min(axis=1).argsort()[:nreassign]
    pvec = list(assignment.partition_vector)
    for row in picked:
        # +1 converts the 0-based argmax into a 1-based cluster label.
        pvec[sample[row]] = best_cluster[row] + 1
    return Partition(tuple(pvec))
def __init__(self, nclusters, collection, tmpdir=TMPDIR, analysis='nj',
             initial_assignment=None, scorer=None):
    """Set up the optimiser.

    Validates the analysis name, makes sure the collection has trees
    (calculating NJ trees if not), adopts or builds a Scorer, and scores
    the initial assignment (all records in one cluster unless one is
    supplied), recording it as the global best for `nclusters`.
    """
    optioncheck(analysis, ANALYSES + ['tc', 'TreeCollection'])
    # Bug fix: the original tested `self.analysis` before any assignment,
    # which raised AttributeError; normalise the *argument* instead.
    if analysis == 'tc':
        self.analysis = 'TreeCollection'
    else:
        self.analysis = analysis
    self.Collection = collection

    if not self.Collection.records[0].tree:
        print('Calculating {} trees for collection...'.format(analysis))
        self.Collection.calc_NJ_trees()

    self.datatype = collection.datatype
    if scorer is not None and isinstance(scorer, Scorer):
        self.scorer = scorer
    else:
        self.scorer = Scorer(self.Collection)

    self.nclusters = nclusters
    self.tmpdir = tmpdir

    print('Calculating initial scores...')
    if initial_assignment is None:
        # Default: everything in one cluster (label 0).
        initial_assignment = Partition(tuple([0] * len(collection)))

    self.global_best_scores = {}
    self.global_best_assignments = {}
    self.global_best_scores[self.nclusters] = self.scorer.score(
        initial_assignment, history=True)
    self.global_best_assignments[self.nclusters] = initial_assignment

    # Search-state counters.
    self.done_worse = 0
    self.stayed_put = 0
    self.i = 0
    self.resets = 0
    self.merges = 0
class EMTrees(object):
    """EM-style clustering of alignment records over tree space.

    Alternates between building a Cluster (with its own tree) for each
    group and reassigning records to the cluster that scores them best,
    until the overall likelihood stops improving.
    """

    def __init__(self, collection, nclusters, metric='euc', tmpdir=None):
        if not isinstance(nclusters, int) or nclusters <= 1:
            raise Exception('Need appropriate value for number of clusters.')

        self.nclusters = nclusters
        # Could check for entries
        self.scorer = Scorer(collection.records, collection.analysis)
        self.datatype = collection.datatype
        self.metric = metric
        # Bug fix: the tmpdir argument was silently ignored — the old
        # `try: self.tmpdir / except:` always raised (the attribute was
        # never set beforehand) and fell back to the collection's tmpdir.
        self.tmpdir = tmpdir if tmpdir is not None else collection.tmpdir

    def clusters_init(self):
        """Seed k labels at random records, then move every movable record
        to its maximum-likelihood cluster."""
        k = self.nclusters
        assignment = [0] * len(self.scorer.records)
        for i in range(k):
            assignment[np.random.randint(0, len(assignment))] = i + 1
        partition = Partition(assignment)
        clusters = [0] * k
        members = partition.get_membership()[1:]
        self.assign_clusters(clusters, members)
        for (index, record) in enumerate(self.scorer.records):
            scores = [self.ml(record, clusters[n])
                      for n in range(self.nclusters)]
            # Never empty a singleton cluster; label 0 means unassigned.
            if assignment.count(assignment[index]) > 1 or assignment[index] == 0:
                assignment[index] = scores.index(max(scores)) + 1
        self.partition = Partition(assignment)
        self.L = self.scorer.score(self.partition)

    def random_partition(self):
        """Assign every record a uniformly random cluster label."""
        self.partition = Partition(tuple(np.random.randint(
            self.nclusters, size=len(self.scorer.records))))
        self.L = self.scorer.score(self.partition)

    def assign_clusters(self, clusters, members):
        """Rebuild (in place) any Cluster whose membership has changed;
        returns the same list."""
        for n in range(self.nclusters):
            if not clusters[n] or clusters[n].members != members[n]:
                clusters[n] = Cluster(members[n], self.scorer.records,
                                      self.scorer.analysis)
        return clusters

    def maximise(self, method):
        """Deterministic sweep loop: reassign every movable record each
        sweep; stop after two consecutive non-improving sweeps."""
        clusters = [0] * self.nclusters
        alg = getattr(self, method)
        count = 0
        while True:
            self.assign_clusters(clusters, self.partition.get_membership())
            assignment = list(self.partition.partition_vector)
            for (index, record) in enumerate(self.scorer.records):
                scores = [alg(record, clusters[n])
                          for n in range(self.nclusters)]
                if assignment.count(assignment[index]) > 1 or assignment[index] == 0:
                    assignment[index] = scores.index(max(scores)) + 1
            assignment = Partition(assignment)
            score = self.scorer.score(assignment)
            if score > self.L:
                self.L = score
                self.partition = assignment
            else:
                count += 1
                if count > 1:
                    # Algorithm is deterministic so no need for more iterations
                    break

    def maximise_random(self, method):
        """Stochastic sweep: one random, not-yet-sampled record per turn;
        improvement resets the pool, a full pool of failures ends it."""
        clusters = [0] * self.nclusters
        alg = getattr(self, method)
        count = 0
        sampled = []
        while True:
            self.assign_clusters(clusters, self.partition.get_membership())
            assignment = list(self.partition.partition_vector)
            index = randint(0, len(self.scorer.records) - 1)
            if index in sampled:
                continue
            record = self.scorer.records[index]
            sampled.append(index)
            scores = [alg(record, clusters[n]) for n in range(self.nclusters)]
            if assignment.count(assignment[index]) > 1 or assignment[index] == 0:
                assignment[index] = scores.index(max(scores)) + 1
            assignment = Partition(assignment)
            score = self.scorer.score(assignment)
            if score > self.L:
                self.L = score
                self.partition = assignment
                sampled = []
                count = 0
            else:
                count += 1
                if count == len(assignment):
                    break

    def maximise_heuristic(self):
        """Stochastic reassignment for 1000 iterations; each turn chooses
        between the two best clusters with two-way-softmax probability.
        Every 10th iteration the best partition seen is kept in
        max_L / max_partition."""
        clusters = [0] * self.nclusters
        for i in range(1000):
            self.assign_clusters(clusters, self.partition.get_membership())
            assignment = list(self.partition.partition_vector)
            index = randint(0, len(self.scorer.records) - 1)
            record = self.scorer.records[index]
            lls = [self.ml(record, clusters[n])
                   for n in range(self.nclusters)]
            # Rank without mutating lls: the old code popped the best entry,
            # so the runner-up's index could be off by one.
            ranked = sorted(range(len(lls)), key=lls.__getitem__,
                            reverse=True)
            a = {'n': ranked[0], 'll': lls[ranked[0]]}
            b = {'n': ranked[1], 'll': lls[ranked[1]]}
            # Bug fixes: np.maths.exp does not exist (use np.exp), and the
            # original comparison was inverted (favoured the worse cluster).
            a['p'] = np.exp(a['ll'] - logsum(a['ll'], b['ll']))
            if np.random.uniform() < a['p']:
                choice = a['n']
            else:
                choice = b['n']
            if assignment.count(assignment[index]) > 1 or assignment[index] == 0:
                assignment[index] = choice + 1
            assignment = Partition(assignment)
            if i % 10 == 0:
                score = self.scorer.score(assignment)
                # NOTE(review): compares against self.L but records into
                # max_L / max_partition — confirm this is intentional.
                if score > self.L:
                    self.max_L = score
                    self.max_partition = assignment

    def dist(self, obj1, obj2):
        """Negated tree-to-tree distance, so that higher = more similar."""
        distance = DistanceMatrix([obj1.tree, obj2.tree], self.metric)[0][1]
        return -distance

    def ml(self, record, cluster, verbose=1):
        """Score `record` against the cluster's tree with PhyML and return
        the resulting likelihood score."""
        p = Phyml(record, tmpdir=self.tmpdir)
        input_tree = os.path.join(self.tmpdir, 'input_tree')
        cluster.tree.write_to_file(input_tree)
        p.add_tempfile(input_tree)
        p.add_flag('--inputtree', input_tree)
        p.add_flag('-o', 'r')  # Optimise only on substitutions
        p.add_flag('-a', 'e')
        p.add_flag('-b', 0)
        p.add_flag('-c', 4)
        p.add_flag('--quiet', '')
        if self.datatype == 'protein':
            p.add_flag('-d', 'aa')
        elif self.datatype == 'dna':
            p.add_flag('-d', 'nt')
        score = p.run(verbosity=verbose).score
        return score
def random_partition(self):
    """Give every record a uniformly random label in [0, nclusters), then
    store the resulting Partition and its score."""
    labels = np.random.randint(self.nclusters,
                               size=len(self.scorer.records))
    self.partition = Partition(tuple(labels))
    self.L = self.scorer.score(self.partition)
def random_partition(self):
    """Randomise the current partition: one uniform draw from
    [0, nclusters) per record; updates self.partition and self.L."""
    n_records = len(self.scorer.records)
    draw = np.random.randint(self.nclusters, size=n_records)
    self.partition = Partition(tuple(draw))
    self.L = self.scorer.score(self.partition)
class EMTrees(object):
    """EM-style clustering of alignment records over tree space.

    Alternates between building a Cluster (with its own tree) for each
    group and reassigning records to the best-scoring cluster until the
    overall likelihood stops improving.
    """

    def __init__(self, collection, nclusters, metric='euc', tmpdir=None):
        if not isinstance(nclusters, int) or nclusters <= 1:
            raise Exception('Need appropriate value for number of clusters.')

        self.nclusters = nclusters
        # Could check for entries
        self.scorer = Scorer(collection.records, collection.analysis)
        self.datatype = collection.datatype
        self.metric = metric
        # Bug fix: the tmpdir argument was silently ignored — the old
        # `try: self.tmpdir / except:` always raised (the attribute was
        # never set beforehand) and fell back to the collection's tmpdir.
        self.tmpdir = tmpdir if tmpdir is not None else collection.tmpdir

    def clusters_init(self):
        """Seed k labels at random records, then move every movable record
        to its maximum-likelihood cluster."""
        k = self.nclusters
        assignment = [0] * len(self.scorer.records)
        for i in range(k):
            assignment[np.random.randint(0, len(assignment))] = i + 1
        partition = Partition(assignment)
        clusters = [0] * k
        members = partition.get_membership()[1:]
        self.assign_clusters(clusters, members)
        for (index, record) in enumerate(self.scorer.records):
            scores = [self.ml(record, clusters[n])
                      for n in range(self.nclusters)]
            # Never empty a singleton cluster; label 0 means unassigned.
            if assignment.count(assignment[index]) > 1 or assignment[index] == 0:
                assignment[index] = scores.index(max(scores)) + 1
        self.partition = Partition(assignment)
        self.L = self.scorer.score(self.partition)

    def random_partition(self):
        """Assign every record a uniformly random cluster label."""
        self.partition = Partition(tuple(np.random.randint(
            self.nclusters, size=len(self.scorer.records))))
        self.L = self.scorer.score(self.partition)

    def assign_clusters(self, clusters, members):
        """Rebuild (in place) any Cluster whose membership has changed;
        returns the same list."""
        for n in range(self.nclusters):
            if not clusters[n] or clusters[n].members != members[n]:
                clusters[n] = Cluster(members[n], self.scorer.records,
                                      self.scorer.analysis)
        return clusters

    def maximise(self, method):
        """Deterministic sweep loop: reassign every movable record each
        sweep; stop after two consecutive non-improving sweeps."""
        clusters = [0] * self.nclusters
        alg = getattr(self, method)
        count = 0
        while True:
            self.assign_clusters(clusters, self.partition.get_membership())
            assignment = list(self.partition.partition_vector)
            for (index, record) in enumerate(self.scorer.records):
                scores = [alg(record, clusters[n])
                          for n in range(self.nclusters)]
                if assignment.count(assignment[index]) > 1 or assignment[index] == 0:
                    assignment[index] = scores.index(max(scores)) + 1
            assignment = Partition(assignment)
            score = self.scorer.score(assignment)
            if score > self.L:
                self.L = score
                self.partition = assignment
            else:
                count += 1
                if count > 1:
                    # Algorithm is deterministic so no need for more iterations
                    break

    def maximise_random(self, method):
        """Stochastic sweep: one random, not-yet-sampled record per turn;
        improvement resets the pool, a full pool of failures ends it."""
        clusters = [0] * self.nclusters
        alg = getattr(self, method)
        count = 0
        sampled = []
        while True:
            self.assign_clusters(clusters, self.partition.get_membership())
            assignment = list(self.partition.partition_vector)
            index = randint(0, len(self.scorer.records) - 1)
            if index in sampled:
                continue
            record = self.scorer.records[index]
            sampled.append(index)
            scores = [alg(record, clusters[n]) for n in range(self.nclusters)]
            if assignment.count(assignment[index]) > 1 or assignment[index] == 0:
                assignment[index] = scores.index(max(scores)) + 1
            assignment = Partition(assignment)
            score = self.scorer.score(assignment)
            if score > self.L:
                self.L = score
                self.partition = assignment
                sampled = []
                count = 0
            else:
                count += 1
                if count == len(assignment):
                    break

    def maximise_heuristic(self):
        """Stochastic reassignment for 1000 iterations; each turn chooses
        between the two best clusters with two-way-softmax probability.
        Every 10th iteration the best partition seen is kept in
        max_L / max_partition."""
        clusters = [0] * self.nclusters
        for i in range(1000):
            self.assign_clusters(clusters, self.partition.get_membership())
            assignment = list(self.partition.partition_vector)
            index = randint(0, len(self.scorer.records) - 1)
            record = self.scorer.records[index]
            lls = [self.ml(record, clusters[n])
                   for n in range(self.nclusters)]
            # Rank without mutating lls: the old code popped the best entry,
            # so the runner-up's index could be off by one.
            ranked = sorted(range(len(lls)), key=lls.__getitem__,
                            reverse=True)
            a = {'n': ranked[0], 'll': lls[ranked[0]]}
            b = {'n': ranked[1], 'll': lls[ranked[1]]}
            # Bug fixes: np.maths.exp does not exist (use np.exp), and the
            # original comparison was inverted (favoured the worse cluster).
            a['p'] = np.exp(a['ll'] - logsum(a['ll'], b['ll']))
            if np.random.uniform() < a['p']:
                choice = a['n']
            else:
                choice = b['n']
            if assignment.count(assignment[index]) > 1 or assignment[index] == 0:
                assignment[index] = choice + 1
            assignment = Partition(assignment)
            if i % 10 == 0:
                score = self.scorer.score(assignment)
                # NOTE(review): compares against self.L but records into
                # max_L / max_partition — confirm this is intentional.
                if score > self.L:
                    self.max_L = score
                    self.max_partition = assignment

    def dist(self, obj1, obj2):
        """Negated tree-to-tree distance, so that higher = more similar."""
        distance = DistanceMatrix([obj1.tree, obj2.tree], self.metric)[0][1]
        return -distance

    def ml(self, record, cluster, verbose=1):
        """Score `record` against the cluster's tree with PhyML and return
        the resulting likelihood score."""
        p = Phyml(record, tmpdir=self.tmpdir)
        input_tree = os.path.join(self.tmpdir, 'input_tree')
        cluster.tree.write_to_file(input_tree)
        p.add_tempfile(input_tree)
        p.add_flag('--inputtree', input_tree)
        p.add_flag('-o', 'r')  # Optimise only on substitutions
        p.add_flag('-a', 'e')
        p.add_flag('-b', 0)
        p.add_flag('-c', 4)
        p.add_flag('--quiet', '')
        if self.datatype == 'protein':
            p.add_flag('-d', 'aa')
        elif self.datatype == 'dna':
            p.add_flag('-d', 'nt')
        score = p.run(verbosity=verbose).score
        return score
def likelihood(partition, scorer):
    """Wrap a raw partition vector in a Partition and score it with the
    supplied scorer."""
    return scorer.score(Partition(partition))
def get_true_partition(self):
    """Expand self.class_list (one size per class) into a flat, 1-based
    membership vector; cache it as self.true_partition and return it."""
    vector = []
    for label, size in enumerate(self.class_list, start=1):
        vector.extend([label] * size)
    self.true_partition = Partition(vector)
    return self.true_partition
def random_partition(self, nclusters):
    """Return a Partition giving each record in the Collection an
    independent uniform label in [0, nclusters)."""
    labels = np.random.randint(nclusters, size=len(self.Collection))
    return Partition(tuple(labels))
def get_partition_from_file(filename):
    """Read a whitespace-separated partition vector from the first line of
    `filename` and return it as a Partition."""
    with open(filename) as handle:
        first_line = handle.readline()
    vector = tuple(int(token) for token in first_line.split())
    return Partition(vector)
# Smoke tests for the Plotter constructors and plotting entry points.
# Python 2 print statements; `c`, `dm` and `plotter_from_collection` are
# defined earlier in the script (outside this excerpt).
print 'yep'
print 'Can build Plotter from Collection + DistanceMatrix...',
plotter_with_dm = Plotter(c, dm=dm)
print 'yes'
print 'Can build Plotter from a list of TrClSeq objects...',
plotter_from_records = Plotter(records=c.records)
print 'yes'
print 'Can build Plotter from DistanceMatrix only...',
plotter_just_dm = Plotter(dm=dm)
print 'yes'
print 'Testing plotting'
# Ground-truth partition: four contiguous clusters of 15 records each.
p = Partition(tuple([1] * 15 + [2] * 15 + [3] * 15 + [4] * 15))
# A scrambled 60-element partition over the same four labels.
p_rand = Partition(
    tuple([
        1, 3, 1, 4, 2, 3, 3, 3, 2, 2, 1, 3, 3, 4, 1, 4, 1, 1, 2, 4, 1,
        2, 2, 2, 2, 2, 3, 4, 2, 2, 1, 4, 3, 1, 4, 4, 3, 1, 3, 1, 3, 2,
        4, 4, 1, 4, 1, 2, 3, 4, 2, 4, 3, 2, 1, 3, 4, 4, 1, 3
    ]))
fig1 = plotter_from_collection.embedding('MDS', 2, p)  # 2d MDS embedding
fig2 = plotter_from_collection.embedding('MDS', 3, p)  # 3d MDS embedding
fig3 = plotter_from_collection.embedding('spectral', 2, p_rand)  # 2d spectral
fig4 = plotter_from_collection.embedding('spectral', 3, p_rand)  # 3d spectral
fig5 = plotter_just_dm.heatmap(p)  # distance matrix as a heatmap
fig6 = plotter_just_dm.heatmap(p_rand)
plt.show()
def merge(self, assignment, label1, label2):
    """Collapse two clusters into one: every record carrying `label1` is
    relabelled `label2`; returns the merged Partition."""
    merged = tuple(label2 if x == label1 else x
                   for x in assignment.partition_vector)
    return Partition(merged)
if not len(p) == len(c): print('Partition is of incorrect length ' '(expected {0}, got {1}'.format(len(c), len(p))) sys.exit(1) o = Optimiser(args.nclusters, c, tmpdir=new_tmpdir, initial_assignment=p) else: o = Optimiser(args.nclusters, c, tmpdir=new_tmpdir) # Hierarchical clustering via likelihood if args.hierarchical is not None: if args.hierarchical == 'top_down': p = Partition(tuple([1] * len(c))) elif args.hierarchical == 'bottom_up': p = Partition(range(1, len(c) + 1)) result = o.constrain_assignment(p, args.nclusters) # o.Scorer.clear_history() score = o.Scorer.score(result) o.global_best_assignments[args.nclusters] = result o.global_best_scores[args.nclusters] = score # Quit early if args.quit: pass else: if args.merge is True: