def __init__(self, collection, nclusters, metric='euc', tmpdir=None):
    """Initialise the clusterer.

    :param collection: project Collection providing records, analysis,
        datatype and a fallback tmpdir
    :param nclusters: number of clusters to find; must be an int > 1
    :param metric: tree-distance metric name (default 'euc')
    :param tmpdir: working directory; falls back to collection.tmpdir
    :raises ValueError: if nclusters is not an integer greater than 1
    """
    if not isinstance(nclusters, int) or nclusters <= 1:
        # ValueError is more specific than Exception and still caught by
        # any caller handling Exception
        raise ValueError('Need appropriate value for number of clusters.')

    self.nclusters = nclusters
    self.scorer = Scorer(collection.records, collection.analysis)  # Could check for entries
    self.datatype = collection.datatype
    self.metric = metric

    # BUGFIX: the original probed a not-yet-set self.tmpdir inside a bare
    # try/except, which always raised and therefore always ignored the
    # tmpdir argument. Honour the argument, fall back to the collection's.
    self.tmpdir = tmpdir if tmpdir is not None else collection.tmpdir
def __init__(self, nclusters, collection, tmpdir=TMPDIR, analysis='nj',
             initial_assignment=None, scorer=None):
    """Set up the optimiser.

    Validates the analysis type, ensures the collection has trees,
    builds (or accepts) a Scorer, and scores the initial assignment
    (all records in one cluster unless one is supplied).

    :param nclusters: target number of clusters
    :param collection: project Collection of alignment records
    :param tmpdir: working directory for external tools
    :param analysis: tree-building analysis name; checked by optioncheck
    :param initial_assignment: optional starting Partition
    :param scorer: optional pre-built Scorer to reuse
    """
    optioncheck(analysis, ANALYSES + ['tc', 'TreeCollection'])
    # BUGFIX: compare the *argument* -- self.analysis is not set yet at
    # this point, so the original raised AttributeError
    if analysis == 'tc':
        self.analysis = 'TreeCollection'
    else:
        self.analysis = analysis
    self.Collection = collection
    if not self.Collection.records[0].tree:
        print('Calculating {} trees for collection...'.format(analysis))
        self.Collection.calc_NJ_trees()
    self.datatype = collection.datatype
    if scorer is not None and isinstance(scorer, Scorer):
        self.scorer = scorer
    else:
        self.scorer = Scorer(self.Collection)
    self.nclusters = nclusters
    self.tmpdir = tmpdir
    print('Calculating initial scores...')
    if initial_assignment is None:
        # everything in cluster 0 to start
        initial_assignment = Partition(tuple([0] * len(collection)))
    self.global_best_scores = {}
    self.global_best_assignments = {}
    self.global_best_scores[self.nclusters] = self.scorer.score(
        initial_assignment, history=True)
    self.global_best_assignments[self.nclusters] = initial_assignment
    # hill-climbing bookkeeping counters
    self.done_worse = 0
    self.stayed_put = 0
    self.i = 0
    self.resets = 0
    self.merges = 0
def __init__(self, nclusters, collection, tmpdir=TMPDIR, analysis='nj',
             initial_assignment=None, scorer=None):
    """Set up the optimiser.

    Validates the analysis type, ensures the collection has trees,
    builds (or accepts) a Scorer, and scores the initial assignment
    (all records in one cluster unless one is supplied).

    :param nclusters: target number of clusters
    :param collection: project Collection of alignment records
    :param tmpdir: working directory for external tools
    :param analysis: tree-building analysis name; checked by optioncheck
    :param initial_assignment: optional starting Partition
    :param scorer: optional pre-built Scorer to reuse
    """
    optioncheck(analysis, ANALYSES + ['tc', 'TreeCollection'])
    # BUGFIX: compare the *argument* -- self.analysis is not set yet at
    # this point, so the original raised AttributeError
    if analysis == 'tc':
        self.analysis = 'TreeCollection'
    else:
        self.analysis = analysis
    self.Collection = collection
    if not self.Collection.records[0].tree:
        print('Calculating {} trees for collection...'.format(analysis))
        self.Collection.calc_NJ_trees()
    self.datatype = collection.datatype
    if scorer is not None and isinstance(scorer, Scorer):
        self.scorer = scorer
    else:
        self.scorer = Scorer(self.Collection.records,
                             analysis=analysis,
                             datatype=self.datatype,
                             tmpdir=tmpdir)
    self.nclusters = nclusters
    self.tmpdir = tmpdir
    print('Calculating initial scores...')
    if initial_assignment is None:
        # everything in cluster 0 to start
        initial_assignment = Partition(tuple([0] * len(collection)))
    self.global_best_scores = {}
    self.global_best_assignments = {}
    self.global_best_scores[self.nclusters] = self.scorer.score(
        initial_assignment, history=True)
    self.global_best_assignments[self.nclusters] = initial_assignment
    # hill-climbing bookkeeping counters
    self.done_worse = 0
    self.stayed_put = 0
    self.i = 0
    self.resets = 0
    self.merges = 0
class Optimiser(object):
    """Hill-climbing optimiser over partitions of an alignment collection.

    Partitions are scored via a Scorer (likelihood of concatenated
    clusters); the optimiser moves records between clusters, merges the
    closest pair, or splits on the least representative alignment, and
    tracks the global best assignment per cluster count.
    """

    def __init__(self, nclusters, collection, tmpdir=TMPDIR, analysis='nj',
                 initial_assignment=None, scorer=None):
        """Validate inputs, ensure trees exist, build the scorer and
        score the initial (single-cluster) assignment."""
        optioncheck(analysis, ANALYSES + ['tc', 'TreeCollection'])
        # BUGFIX: compare the *argument* -- self.analysis is not yet set
        # here, so the original raised AttributeError
        if analysis == 'tc':
            self.analysis = 'TreeCollection'
        else:
            self.analysis = analysis
        self.Collection = collection
        if not self.Collection.records[0].tree:
            print('Calculating {} trees for collection...'.format(analysis))
            self.Collection.calc_NJ_trees()
        self.datatype = collection.datatype
        if scorer is not None and isinstance(scorer, Scorer):
            self.scorer = scorer
        else:
            self.scorer = Scorer(self.Collection.records,
                                 analysis=analysis,
                                 datatype=self.datatype,
                                 tmpdir=tmpdir)
        self.nclusters = nclusters
        self.tmpdir = tmpdir
        print('Calculating initial scores...')
        if initial_assignment is None:
            initial_assignment = Partition(tuple([0] * len(collection)))
        self.global_best_scores = {}
        self.global_best_assignments = {}
        self.global_best_scores[self.nclusters] = self.scorer.score(
            initial_assignment, history=True)
        self.global_best_assignments[self.nclusters] = initial_assignment
        # hill-climbing bookkeeping counters
        self.done_worse = 0
        self.stayed_put = 0
        self.i = 0
        self.resets = 0
        self.merges = 0

    def _reset_counts(self):
        """Reset the per-run hill-climbing counters."""
        self.done_worse = 0
        self.stayed_put = 0
        self.i = 0
        self.resets = 0

    def status(self, current_assignment, details=None):
        """Return a one-line progress summary for the current assignment."""
        iter_ = self.i
        n = len(current_assignment)
        curr_score = self.scorer.score(current_assignment, history=False)
        best_score = self.global_best_scores[n]
        details = ('\t' + str(details) if details is not None else '')
        return 'Iter:{0}\tNclusters:{1}\tCurrent\tscore:{2}\tBest score:{3}{4}'.format(
            iter_, n, curr_score, best_score, details)

    def random_partition(self, nclusters):
        """Return a uniformly random Partition into `nclusters` groups."""
        return Partition(tuple(np.random.randint(nclusters,
                                                 size=len(self.Collection))))

    def update(self, assignment):
        """Record `assignment` as the global best for its cluster count
        if it improves on the stored score."""
        nclusters = len(assignment)  # len(assignment) == number of clusters
        best_score = self.global_best_scores.get(nclusters, NEGINF)
        curr_score = self.scorer.score(assignment, history=False)
        if (curr_score - best_score) > EPS:
            self.global_best_assignments[nclusters] = assignment
            # reuse curr_score rather than re-scoring (original scored twice)
            self.global_best_scores[nclusters] = curr_score

    def get_clusters(self, assignment):
        """Map cluster label -> list of record indices in that cluster."""
        pvec = assignment.partition_vector
        index_dict = defaultdict(list)
        for (position, value) in enumerate(pvec):
            index_dict[value].append(position)
        return index_dict

    def get_cluster_trees(self, assignment, index_dict=None):
        """Map cluster label -> concatenated tree, building any missing
        concatenations through the scorer."""
        index_dict = (index_dict or self.get_clusters(assignment))
        tree_dict = {}
        for (k, v) in index_dict.items():
            if not tuple(v) in self.scorer.concats:
                self.scorer.add(tuple(v))
            tree_dict[k] = self.scorer.concats[tuple(v)]
        return tree_dict

    def score_sample(self, sample, assignment):
        """Score a pre-made sample of record indices against every
        cluster tree.

        Returns an m*n matrix: m = len(sample), n = number of clusters
        encoded in `assignment`.
        """
        cluster_trees = self.get_cluster_trees(assignment)
        scores = np.zeros((len(sample), len(cluster_trees)))
        for i, record_index in enumerate(sample):
            rec = self.Collection.records[record_index]
            for j, tree in cluster_trees.items():
                # cluster labels are 1-based; matrix columns are 0-based
                scores[i, j - 1] = self.test(rec, tree)
        return scores

    def constrain_assignment(self, assignment, nclusters=None):
        """Merge or split until the assignment has exactly `nclusters`
        clusters (defaults to self.nclusters)."""
        if nclusters is None:
            nclusters = self.nclusters
        if (nclusters < 1) or (nclusters > len(self.Collection)):
            raise ValueError('Invalid number of clusters: {}'.format(nclusters))
        while len(assignment.get_membership()) > nclusters:
            assignment = self.merge_closest(assignment)
        while len(assignment.get_membership()) < nclusters:
            assignment = self.split_search(assignment)
        return assignment

    def make_new_assignment(self, sample, scores, assignment, nreassign=1,
                            choose='max'):
        """Make a new Partition by reassigning `nreassign` sampled
        records to their best-scoring clusters."""
        # BUGFIX: validate `choose` up front -- an invalid value used to
        # leave `reassignments` undefined (NameError below)
        optioncheck(choose, ('max', 'min'))
        new_clusters = scores.argmax(axis=1)
        M = scores / scores.sum(axis=1)[:, np.newaxis]
        if choose == 'max':
            reassignments = M.max(axis=1).argsort()[-nreassign:]
        else:
            reassignments = M.min(axis=1).argsort()[:nreassign]
        new_assignment = list(assignment.partition_vector)
        for i in reassignments:
            # cluster labels are in [1, x]; new_clusters is in [0, x-1]
            new_assignment[sample[i]] = new_clusters[i] + 1
        return Partition(tuple(new_assignment))

    def move(self, sample_size, assignment, nreassign=1, choose='max',
             sampled=None):
        """Draw a sample of unsampled records, score them and reassign.

        Wraps self.score_sample + self.make_new_assignment.
        """
        if sampled is None:
            sampled = list()
        unsampled = set(range(len(self.Collection))) - set(sampled)
        if len(unsampled) > 0:
            if sample_size > len(unsampled):
                # BUGFIX: keep `sample` an indexable sequence -- it was a
                # set, which make_new_assignment indexes with sample[i]
                sample = sorted(unsampled)
            else:
                # BUGFIX: random.sample on a set is deprecated (error in
                # Python 3.11+); sample from a sorted list instead
                sample = random.sample(sorted(unsampled), sample_size)
            self.sampled.extend(sample)
            scores = self.score_sample(sample, assignment)
            assignment = self.make_new_assignment(sample, scores, assignment,
                                                  nreassign, choose)
        return assignment

    def merge(self, assignment, label1, label2):
        """Relabel every member of cluster `label1` as `label2`."""
        pvec = ((x if x != label1 else label2)
                for x in assignment.partition_vector)
        return Partition(tuple(pvec))

    def merge_closest(self, assignment):
        """Try all pairwise merges and return the best-scoring result."""
        print('Finding clusters to merge...')
        clusters = self.get_clusters(assignment)
        best_score = NEGINF
        merging = [None, None]
        for i in clusters:
            for j in clusters:
                if i >= j:  # each unordered pair once
                    continue
                print('Testing Clusters {0} and {1}'.format(i, j))
                test_assignment = self.merge(assignment, i, j)
                self.update(test_assignment)
                score = self.scorer.score(test_assignment, history=False)
                if score > best_score:
                    merging[0] = i
                    merging[1] = j
                    best_score = score
                    best_assignment = test_assignment
        print('Merging clusters {0} and {1}'.format(*merging))
        print('Best assignment: {0}'.format(best_assignment))
        return best_assignment

    def split(self, k, assignment, verbosity=1):
        """Split cluster `k` on its least representative alignment
        (lowest per-site likelihood against the cluster tree)."""
        if verbosity > 1:
            print(assignment)
        members = self.get_clusters(assignment)[k]
        if len(members) == 1:
            return assignment
        elif len(members) == 2:
            new_partition_vector = list(assignment.partition_vector)
            new_partition_vector[members[0]] = max(assignment.partition_vector) + 1
            new_assignment = Partition(new_partition_vector)
            return new_assignment
        tree = self.get_cluster_trees(assignment)[k]
        alignment_scores = {}
        if verbosity > 0:
            print('Calculating alignment scores...')
        for i in members:
            r = self.Collection.records[i]
            # per-site likelihood
            alignment_scores[i] = self.test(r, tree) / float(r.seqlength)
        # BUGFIX: dict.iteritems() is Python 2 only; items() works everywhere
        seed, min_score = min(alignment_scores.items(),
                              key=operator.itemgetter(1))
        print('Splitting on {0}.'.format(seed + 1))  # convert to 1-based indexing
        new_assignment = list(assignment.partition_vector)
        new_assignment[seed] = max(assignment.partition_vector) + 1
        if verbosity > 1:
            print('New Partition: {0}'.format(new_assignment))
        if verbosity > 0:
            print('Assigning to new partition...')
        new_assignment = Partition(new_assignment)
        scores = self.score_sample(members, new_assignment)
        assignment = self.make_new_assignment(members, scores, new_assignment,
                                              nreassign=len(members))
        if verbosity > 1:
            print('Returning: {0}'.format(assignment))
        return assignment

    def split_max_var(self, assignment):
        """Identify the cluster with the largest per-site score variance."""
        clusters = self.get_clusters(assignment)
        var_dict = {}
        for k in clusters.keys():
            var_dict[k] = self.var(clusters[k])
        print(var_dict)
        # BUGFIX: select from var_dict (the computed variances), not from
        # `clusters`, and use items() instead of Python-2 iteritems()
        cluster_to_split, var = max(var_dict.items(),
                                    key=operator.itemgetter(1))
        # NOTE(review): the original discards the result (implicitly
        # returns None); preserved for interface compatibility -- TODO
        # confirm whether this was meant to call self.split()

    def split_search(self, assignment, update=True):
        """Try splitting each cluster and return the best-scoring result."""
        clusters = self.get_clusters(assignment)
        k = len(assignment)
        best_score = NEGINF
        for i in clusters:
            print('i: {0}'.format(i))
            test_assignment = self.split(i, assignment)
            if len(test_assignment) == k + 1:
                score = self.scorer.score(test_assignment, history=False)
                self.update(test_assignment)
            else:
                # split failed to add a cluster -- should not happen
                score = -np.Inf
                print('Something has gone wrong')
                print(test_assignment)
            print(score)
            if score > best_score:
                best_score = score
                best_assignment = test_assignment
        return best_assignment

    def test(self, record, tree, model=None):
        """Test an alignment against a tree topology, padding or pruning
        the record so its label set matches the tree's, and return the
        likelihood."""
        tmp_record = copy.deepcopy(record)
        header_set = set(tmp_record.headers)
        extra_in_tree = tree.labels - header_set
        extra_in_record = header_set - tree.labels
        if extra_in_tree:
            # pad with all-gap sequences for taxa only in the tree
            for lab in extra_in_tree:
                tmp_record.headers.append(lab)
                tmp_record.sequences.append(''.join(['-'] * tmp_record.seqlength))
            tmp_record._update()
        if extra_in_record:
            # drop taxa absent from the tree
            for lab in extra_in_record:
                i = tmp_record.headers.index(lab)
                tmp_record.headers = (tmp_record.headers[:i]
                                      + tmp_record.headers[i + 1:])
                tmp_record.sequences = (tmp_record.sequences[:i]
                                        + tmp_record.sequences[i + 1:])
            tmp_record._update()
        # BUGFIX: was `tmp_alignment.likelihood(...)` -- no such name exists;
        # the padded/pruned copy is `tmp_record`
        return tmp_record.likelihood(tree, self.tmpdir, fit_rates=True)

    def var(self, members):
        """Per-site score of the concatenation of `members`."""
        score = self.scorer.add(tuple(members)).score
        records = [self.Collection.records[i] for i in members]
        total_length = sum([r.seqlength for r in records])
        return score / total_length

    def optimise(self, assignment, nclusters=None, update=True, history=True,
                 sample_size=10, nreassign=10, max_stayed_put=25,
                 max_resets=5, max_done_worse=5, max_iter=1000):
        """Hill-climb from `assignment` at a fixed cluster count.

        Stops after too many non-improving iterations, resets, or
        max_iter total iterations; returns the local best assignment.
        """
        if nclusters is None:
            nclusters = self.nclusters
        assignment = self.constrain_assignment(assignment, nclusters)
        local_best_assignment = assignment
        local_best_score = self.scorer.score(local_best_assignment,
                                             history=False)
        current_assignment = local_best_assignment
        self.sampled = []
        print(self.status(current_assignment))
        while True:
            if self.stayed_put > max_stayed_put:
                print('stayed put too many times ({0})'.format(max_stayed_put))
                break
            if self.resets == max_resets:
                print('Reset limit reached ({0})'.format(max_resets))
                break
            if self.done_worse == max_done_worse:
                print('wandered off, resetting...')
                self.resets += 1
                self.done_worse = 0
                current_assignment = local_best_assignment
            if self.i == max_iter:
                print('max iterations reached')
                break
            new_assignment = self.move(sample_size, current_assignment,
                                       nreassign)
            new_assignment = self.constrain_assignment(new_assignment,
                                                       nclusters)
            score = self.scorer.score(new_assignment, history=history)
            self.update(new_assignment)
            if (score - local_best_score) > EPS:
                # improved: restart sampling from scratch
                self.sampled = []
                local_best_score = score
                local_best_assignment = new_assignment
                self.stayed_put = 0
                self.done_worse = 0
                self.resets = 0
                print(self.status(new_assignment, '(Improved)'))
            elif np.abs(score - local_best_score) < EPS:
                self.stayed_put += 1
                self.done_worse = 0
                message = ('(No improvement - [{}/{}])'.format(
                    self.stayed_put, max_stayed_put))
                print(self.status(new_assignment, message))
            else:
                self.sampled = []
                self.done_worse += 1
                message = '(Did worse - [{}/{}]'.format(self.done_worse,
                                                        max_done_worse)
                print(self.status(new_assignment, message))
            self.i += 1
        self._reset_counts()
        return local_best_assignment

    def optimise_with_variable_clusters(self, assignment, target_clusters,
                                        max_clusters, optimise_on_ascent=True,
                                        optimise_on_descent=True, update=True,
                                        **kwargs):
        """Ascend to `max_clusters` then descend to `target_clusters`,
        optionally optimising at every cluster count along the way."""
        if max_clusters < target_clusters:
            raise ValueError('max_clusters ({}) must be at least equal to '
                             'target_clusters ({})'.format(max_clusters,
                                                           target_clusters))
        current_clusters = len(assignment)
        print('Optimising current assignment with {} clusters. Optimiser will '
              'ascend to {} clusters, and descend to a target of {} clusters'
              '.'.format(current_clusters, max_clusters, target_clusters))
        for n in range(current_clusters, max_clusters + 1):
            print("ASCENDING (optimisation:{}) -> Current target: "
                  "{} clusters".format(('ON' if optimise_on_ascent else 'OFF'),
                                       n))
            if optimise_on_ascent:
                assignment = self.optimise(assignment, nclusters=n, **kwargs)
            else:
                assignment = self.constrain_assignment(assignment, n)
        for n in range(max_clusters - 1, target_clusters - 1, -1):
            print('DESCENDING (optimisation:{}) -> Current target: {} '
                  'clusters'.format(('ON' if optimise_on_descent else 'OFF'),
                                    n))
            if optimise_on_descent:
                assignment = self.optimise(assignment, nclusters=n, **kwargs)
            else:
                assignment = self.constrain_assignment(assignment, n)
        return self.constrain_assignment(assignment, target_clusters)

    def write(self, filename):
        """Dump the scorer's history to a tab-separated file."""
        headers = ['Iteration', 'CPU Time', 'Likelihood', 'Partition',
                   'NClusters']
        # BUGFIX: len(x[-1]) is an int; wrap it in a list before
        # concatenating (list + int raised TypeError)
        output = [[i] + x + [len(x[-1])]
                  for (i, x) in enumerate(self.scorer.history)]
        with open(filename, 'w+') as file_:
            writer = csv.writer(file_, delimiter='\t', quoting=csv.QUOTE_NONE)
            writer.writerow(headers)
            writer.writerows(output)
def __init__(self, members, records, analysis):
    """Group a subset of `records` (chosen by index) and build the tree
    for their concatenation via a Scorer."""
    member_ids = tuple(members)
    self.members = member_ids
    self.records = [records[idx] for idx in member_ids]
    self.scorer = Scorer(records, analysis)
    self.tree = self.scorer.add(member_ids)
class EMTrees(object):
    """EM-style clustering of alignments, scored by tree likelihoods."""

    def __init__(self, collection, nclusters, metric='euc', tmpdir=None):
        """Initialise the clusterer.

        :param collection: project Collection with records/analysis/datatype
        :param nclusters: number of clusters; must be an int > 1
        :param metric: tree-distance metric name for dist()
        :param tmpdir: working directory; falls back to collection.tmpdir
        :raises ValueError: if nclusters is not an integer greater than 1
        """
        if not isinstance(nclusters, int) or nclusters <= 1:
            raise ValueError('Need appropriate value for number of clusters.')
        self.nclusters = nclusters
        self.scorer = Scorer(collection.records, collection.analysis)  # Could check for entries
        self.datatype = collection.datatype
        self.metric = metric
        # BUGFIX: the original probed a not-yet-set self.tmpdir in a bare
        # try/except, which always raised and so always ignored the tmpdir
        # argument. Honour the argument, fall back to the collection's.
        self.tmpdir = tmpdir if tmpdir is not None else collection.tmpdir

    def clusters_init(self):
        """Seed clusters on random records, then assign the rest by ML."""
        k = self.nclusters
        assignment = [0] * len(self.scorer.records)
        # NOTE(review): random indices may collide, leaving fewer than k
        # seeds -- TODO confirm this is acceptable
        for i in range(k):
            assignment[np.random.randint(0, len(assignment))] = i + 1
        partition = Partition(assignment)
        clusters = [0] * k
        # presumably [0] is the unassigned group; verify against Partition
        members = partition.get_membership()[1:]
        self.assign_clusters(clusters, members)
        for (index, record) in enumerate(self.scorer.records):
            scores = [self.ml(record, clusters[n])
                      for n in range(self.nclusters)]
            # only move records that are unassigned or not alone in a cluster
            if assignment.count(assignment[index]) > 1 or assignment[index] == 0:
                assignment[index] = scores.index(max(scores)) + 1
        self.partition = Partition(assignment)
        self.L = self.scorer.score(self.partition)

    def random_partition(self):
        """Start from a uniformly random partition."""
        self.partition = Partition(tuple(
            np.random.randint(self.nclusters, size=len(self.scorer.records))))
        self.L = self.scorer.score(self.partition)

    def assign_clusters(self, clusters, members):
        """Rebuild Cluster objects whose membership changed (in place)."""
        for n in range(self.nclusters):
            if not clusters[n] or clusters[n].members != members[n]:
                clusters[n] = Cluster(members[n], self.scorer.records,
                                      self.scorer.analysis)
        return clusters

    def maximise(self, method):
        """Deterministic EM loop: reassign every record each pass until
        the score stops improving."""
        clusters = [0] * self.nclusters
        alg = getattr(self, method)
        count = 0
        while True:
            self.assign_clusters(clusters, self.partition.get_membership())
            assignment = list(self.partition.partition_vector)
            for (index, record) in enumerate(self.scorer.records):
                scores = [alg(record, clusters[n])
                          for n in range(self.nclusters)]
                if assignment.count(assignment[index]) > 1 or assignment[index] == 0:
                    assignment[index] = scores.index(max(scores)) + 1
            assignment = Partition(assignment)
            score = self.scorer.score(assignment)
            if score > self.L:
                self.L = score
                self.partition = assignment
            else:
                count += 1
                if count > 1:
                    break  # Algorithm is deterministic so no need for more iterations

    def maximise_random(self, method):
        """Stochastic EM loop: reassign one random record per pass; stop
        after a full round of records without improvement."""
        clusters = [0] * self.nclusters
        alg = getattr(self, method)
        count = 0
        sampled = []
        while True:
            self.assign_clusters(clusters, self.partition.get_membership())
            assignment = list(self.partition.partition_vector)
            index = randint(0, len(self.scorer.records) - 1)
            if index in sampled:
                continue
            else:
                record = self.scorer.records[index]
                sampled.append(index)
            scores = [alg(record, clusters[n]) for n in range(self.nclusters)]
            if assignment.count(assignment[index]) > 1 or assignment[index] == 0:
                assignment[index] = scores.index(max(scores)) + 1
            assignment = Partition(assignment)
            score = self.scorer.score(assignment)
            if score > self.L:
                self.L = score
                self.partition = assignment
                sampled = []
                count = 0
            else:
                count += 1
                if count == len(assignment):
                    break

    def maximise_heuristic(self):
        """Stochastic assignment: pick between the two best clusters for a
        random record in proportion to their likelihoods."""
        clusters = [0] * self.nclusters
        sampled = []
        for i in range(1000):
            self.assign_clusters(clusters, self.partition.get_membership())
            assignment = list(self.partition.partition_vector)
            index = randint(0, len(self.scorer.records) - 1)
            record = self.scorer.records[index]
            sampled.append(index)
            lls = [self.ml(record, clusters[n])
                   for n in range(self.nclusters)]
            # BUGFIX: find the top two log-likelihoods without mutating lls.
            # The original popped the best entry and then took .index() in
            # the shortened list, giving a wrong cluster index for the
            # runner-up whenever it followed the best.
            order = sorted(range(len(lls)), key=lls.__getitem__, reverse=True)
            a = {'n': order[0], 'll': lls[order[0]]}
            b = {'n': order[1], 'll': lls[order[1]]}
            # BUGFIX: np.maths does not exist; use np.exp
            a['p'] = np.exp(a['ll'] - logsum(a['ll'], b['ll']))
            # BUGFIX: choose the best cluster WITH probability a['p'] --
            # the original comparison was inverted (picked it with 1-p)
            if np.random.uniform() < a['p']:
                choice = a['n']
            else:
                choice = b['n']
            if assignment.count(assignment[index]) > 1 or assignment[index] == 0:
                assignment[index] = choice + 1
            assignment = Partition(assignment)
            if i % 10 == 0:
                score = self.scorer.score(assignment)
                # NOTE(review): compares against self.L but stores the best
                # in max_L/max_partition -- self.L never updates here, so
                # the comparison baseline stays fixed; confirm intent
                if score > self.L:
                    self.max_L = score
                    self.max_partition = assignment

    def dist(self, obj1, obj2):
        """Negated tree distance between two clusters (higher = closer)."""
        distance = DistanceMatrix([obj1.tree, obj2.tree], self.metric)[0][1]
        return -distance

    def ml(self, record, cluster, verbose=1):
        """Likelihood of `record` on the cluster's tree via Phyml,
        optimising substitution parameters only."""
        p = Phyml(record, tmpdir=self.tmpdir)
        input_tree = os.path.join(self.tmpdir, 'input_tree')
        cluster.tree.write_to_file(input_tree)
        p.add_tempfile(input_tree)
        p.add_flag('--inputtree', input_tree)
        p.add_flag('-o', 'r')       # optimise only on substitution rates
        p.add_flag('-a', 'e')       # estimate gamma shape
        p.add_flag('-b', 0)         # no bootstraps
        p.add_flag('-c', 4)         # 4 rate categories
        p.add_flag('--quiet', '')
        if self.datatype == 'protein':
            p.add_flag('-d', 'aa')
        elif self.datatype == 'dna':
            p.add_flag('-d', 'nt')
        score = p.run(verbosity=verbose).score
        return score
#!/usr/bin/env python from collection import Collection, Scorer from clustering import Partition from random import randint from anneal import * import pickle c = Collection(input_dir='/homes/mgperry/treeCl_data/easy_case/', compression='gz', file_format='phylip', datatype='protein') scorer = Scorer(c.records, 'nj') k = 4 partition = [randint(1, k) for rec in scorer.records] def likelihood(partition, scorer): score = scorer.score(Partition(partition)) return(score) print type(partition) opts = {'func': likelihood, 'x0': partition, 'args': [scorer], 'schedule': 'cluster', 'full_output': 1, 'T0': 100000, 'Tf': 1, 'maxeval': None,
class Optimiser(object):
    """Hill-climbing optimiser over partitions of an alignment collection.

    Partitions are scored via a Scorer; the optimiser moves records
    between clusters, merges the closest pair, or splits on the least
    representative alignment, and tracks the global best assignment per
    cluster count. Cluster index lookups use the module-level
    get_clusters() helper.
    """

    def __init__(self, nclusters, collection, tmpdir=TMPDIR, analysis='nj',
                 initial_assignment=None, scorer=None):
        """Validate inputs, ensure trees exist, build the scorer and
        score the initial (single-cluster) assignment."""
        optioncheck(analysis, ANALYSES + ['tc', 'TreeCollection'])
        # BUGFIX: compare the *argument* -- self.analysis is not yet set
        # here, so the original raised AttributeError
        if analysis == 'tc':
            self.analysis = 'TreeCollection'
        else:
            self.analysis = analysis
        self.Collection = collection
        if not self.Collection.records[0].tree:
            print('Calculating {} trees for collection...'.format(analysis))
            self.Collection.calc_NJ_trees()
        self.datatype = collection.datatype
        if scorer is not None and isinstance(scorer, Scorer):
            self.scorer = scorer
        else:
            self.scorer = Scorer(self.Collection)
        self.nclusters = nclusters
        self.tmpdir = tmpdir
        print('Calculating initial scores...')
        if initial_assignment is None:
            initial_assignment = Partition(tuple([0] * len(collection)))
        self.global_best_scores = {}
        self.global_best_assignments = {}
        self.global_best_scores[self.nclusters] = self.scorer.score(
            initial_assignment, history=True)
        self.global_best_assignments[self.nclusters] = initial_assignment
        # hill-climbing bookkeeping counters
        self.done_worse = 0
        self.stayed_put = 0
        self.i = 0
        self.resets = 0
        self.merges = 0

    def _reset_counts(self):
        """Reset the per-run hill-climbing counters."""
        self.done_worse = 0
        self.stayed_put = 0
        self.i = 0
        self.resets = 0

    def status(self, current_assignment, details=None):
        """Return a one-line progress summary for the current assignment."""
        iter_ = self.i
        n = len(current_assignment)
        curr_score = self.scorer.score(current_assignment, history=False)
        best_score = self.global_best_scores[n]
        details = ('\t' + str(details) if details is not None else '')
        return 'Iter:{0}\tNclusters:{1}\tCurrent\tscore:{2}\tBest score:{3}{4}'.format(
            iter_, n, curr_score, best_score, details)

    def random_partition(self, nclusters):
        """Return a uniformly random Partition into `nclusters` groups."""
        return Partition(tuple(np.random.randint(nclusters,
                                                 size=len(self.Collection))))

    def update(self, assignment):
        """Record `assignment` as the global best for its cluster count
        if it improves on the stored score."""
        nclusters = len(assignment)  # len(assignment) == number of clusters
        best_score = self.global_best_scores.get(nclusters, NEGINF)
        curr_score = self.scorer.score(assignment, history=False)
        if (curr_score - best_score) > EPS:
            self.global_best_assignments[nclusters] = assignment
            # reuse curr_score rather than re-scoring (original scored twice)
            self.global_best_scores[nclusters] = curr_score

    def get_cluster_trees(self, assignment, index_dict=None):
        """Map cluster label -> concatenated tree, building any missing
        concatenations through the scorer."""
        index_dict = (index_dict or get_clusters(assignment))
        tree_dict = {}
        for (k, v) in index_dict.items():
            if not tuple(v) in self.scorer.concats:
                self.scorer.add(tuple(v))
            tree_dict[k] = self.scorer.concats[tuple(v)]
        return tree_dict

    def score_sample(self, sample, assignment):
        """Score a pre-made sample of record indices against every
        cluster tree.

        Returns an m*n matrix: m = len(sample), n = number of clusters
        encoded in `assignment`.
        """
        cluster_trees = self.get_cluster_trees(assignment)
        scores = np.zeros((len(sample), len(cluster_trees)))
        for i, record_index in enumerate(sample):
            rec = self.Collection.records[record_index]
            for j, tree in cluster_trees.items():
                # cluster labels are 1-based; matrix columns are 0-based
                scores[i, j - 1] = self.test(rec, tree)
        return scores

    def constrain_assignment(self, assignment, nclusters=None):
        """Merge or split until the assignment has exactly `nclusters`
        clusters (defaults to self.nclusters)."""
        if nclusters is None:
            nclusters = self.nclusters
        if (nclusters < 1) or (nclusters > len(self.Collection)):
            raise ValueError('Invalid number of clusters: {}'.format(nclusters))
        while len(assignment.get_membership()) > nclusters:
            assignment = self.merge_closest(assignment)
        while len(assignment.get_membership()) < nclusters:
            assignment = self.split_search(assignment)
        return assignment

    def make_new_assignment(self, sample, scores, assignment, nreassign=1,
                            choose='max'):
        """Make a new Partition by reassigning `nreassign` sampled
        records to their best-scoring clusters."""
        optioncheck(choose, ('max', 'min'))
        new_clusters = scores.argmax(axis=1)
        M = scores / scores.sum(axis=1)[:, np.newaxis]
        if choose == 'max':
            reassignments = M.max(axis=1).argsort()[-nreassign:]
        else:
            reassignments = M.min(axis=1).argsort()[:nreassign]
        new_assignment = list(assignment.partition_vector)
        for i in reassignments:
            # cluster labels are in [1, x]; new_clusters is in [0, x-1]
            new_assignment[sample[i]] = new_clusters[i] + 1
        return Partition(tuple(new_assignment))

    def move(self, sample_size, assignment, nreassign=1, choose='max',
             sampled=None):
        """Draw a sample of unsampled records, score them and reassign.

        Wraps self.score_sample + self.make_new_assignment.
        """
        if sampled is None:
            sampled = list()
        unsampled = set(range(len(self.Collection))) - set(sampled)
        if len(unsampled) > 0:
            if sample_size > len(unsampled):
                # BUGFIX: keep `sample` an indexable sequence -- it was a
                # set, which make_new_assignment indexes with sample[i]
                sample = sorted(unsampled)
            else:
                # BUGFIX: random.sample on a set is deprecated (error in
                # Python 3.11+); sample from a sorted list instead
                sample = random.sample(sorted(unsampled), sample_size)
            self.sampled.extend(sample)
            scores = self.score_sample(sample, assignment)
            assignment = self.make_new_assignment(sample, scores, assignment,
                                                  nreassign, choose)
        return assignment

    def merge(self, assignment, label1, label2):
        """Relabel every member of cluster `label1` as `label2`."""
        pvec = ((x if x != label1 else label2)
                for x in assignment.partition_vector)
        return Partition(tuple(pvec))

    def merge_closest(self, assignment):
        """Try all pairwise merges and return the best-scoring result."""
        print('Finding clusters to merge...')
        clusters = get_clusters(assignment)
        best_score = NEGINF
        merging = [None, None]
        for i in clusters:
            for j in clusters:
                if i >= j:  # each unordered pair once
                    continue
                print('Testing Clusters {0} and {1}'.format(i, j))
                test_assignment = self.merge(assignment, i, j)
                self.update(test_assignment)
                score_value = self.scorer.score(test_assignment, history=False)
                if score_value > best_score:
                    merging[0] = i
                    merging[1] = j
                    best_score = score_value
                    best_assignment = test_assignment
        print('Merging clusters {0} and {1}'.format(*merging))
        print('Best assignment: {0}'.format(best_assignment))
        return best_assignment

    def split(self, k, assignment, verbosity=1):
        """Split cluster `k` on its least representative alignment
        (lowest per-site likelihood against the cluster tree)."""
        if verbosity > 1:
            print(assignment)
        members = get_clusters(assignment)[k]
        if len(members) == 1:
            return assignment
        elif len(members) == 2:
            new_partition_vector = list(assignment.partition_vector)
            new_partition_vector[members[0]] = max(assignment.partition_vector) + 1
            new_assignment = Partition(new_partition_vector)
            return new_assignment
        tree = self.get_cluster_trees(assignment)[k]
        alignment_scores = {}
        if verbosity > 0:
            print('Calculating alignment scores...')
        for i in members:
            r = self.Collection.records[i]
            # per-site likelihood
            alignment_scores[i] = self.test(r, tree) / float(r.seqlength)
        # BUGFIX: dict.iteritems() is Python 2 only; items() works everywhere
        seed, min_score = min(alignment_scores.items(),
                              key=operator.itemgetter(1))
        print('Splitting on {0}.'.format(seed + 1))  # convert to 1-based indexing
        new_assignment = list(assignment.partition_vector)
        new_assignment[seed] = max(assignment.partition_vector) + 1
        if verbosity > 1:
            print('New Partition: {0}'.format(new_assignment))
        if verbosity > 0:
            print('Assigning to new partition...')
        new_assignment = Partition(new_assignment)
        scores = self.score_sample(members, new_assignment)
        assignment = self.make_new_assignment(members, scores, new_assignment,
                                              nreassign=len(members))
        if verbosity > 1:
            print('Returning: {0}'.format(assignment))
        return assignment

    def split_max_var(self, assignment):
        """Identify the cluster with the largest per-site score variance."""
        clusters = get_clusters(assignment)
        var_dict = {}
        for k in clusters.keys():
            var_dict[k] = self.var(clusters[k])
        print(var_dict)
        # BUGFIX: select from var_dict (the computed variances), not from
        # `clusters`, and use items() instead of Python-2 iteritems()
        cluster_to_split, var = max(var_dict.items(),
                                    key=operator.itemgetter(1))
        # NOTE(review): the original discards the result (implicitly
        # returns None); preserved for interface compatibility -- TODO
        # confirm whether this was meant to call self.split()

    def split_search(self, assignment, update=True):
        """Try splitting each cluster and return the best-scoring result."""
        clusters = get_clusters(assignment)
        k = len(assignment)
        best_score = NEGINF
        for i in clusters:
            print('i: {0}'.format(i))
            test_assignment = self.split(i, assignment)
            if len(test_assignment) == k + 1:
                curr_score = self.scorer.score(test_assignment, history=False)
                self.update(test_assignment)
            else:
                # split failed to add a cluster -- should not happen
                curr_score = -np.Inf
                print('Something has gone wrong')
                print(test_assignment)
            print(curr_score)
            if curr_score > best_score:
                best_score = curr_score
                best_assignment = test_assignment
        return best_assignment

    def test(self, record, tree, model=None):
        """Test an alignment against a tree topology, padding or pruning
        the record so its label set matches the tree's, and return the
        likelihood."""
        tmp_record = copy.deepcopy(record)
        header_set = set(tmp_record.headers)
        extra_in_tree = tree.labels - header_set
        extra_in_record = header_set - tree.labels
        if extra_in_tree:
            # pad with all-gap sequences for taxa only in the tree
            for lab in extra_in_tree:
                tmp_record.headers.append(lab)
                tmp_record.sequences.append(''.join(['-'] * tmp_record.seqlength))
            tmp_record.update()
        if extra_in_record:
            # drop taxa absent from the tree
            for lab in extra_in_record:
                i = tmp_record.headers.index(lab)
                tmp_record.headers = (tmp_record.headers[:i]
                                      + tmp_record.headers[i + 1:])
                tmp_record.sequences = (tmp_record.sequences[:i]
                                        + tmp_record.sequences[i + 1:])
            tmp_record.update()
        # BUGFIX: was `tmp_alignment.likelihood(...)` -- no such name exists;
        # the padded/pruned copy is `tmp_record`
        return tmp_record.likelihood(tree, self.tmpdir, fit_rates=True)

    def var(self, members):
        """Per-site score of the concatenation of `members`."""
        score = self.scorer.add(tuple(members)).score
        records = [self.Collection.records[i] for i in members]
        total_length = sum([r.seqlength for r in records])
        return score / total_length

    def optimise(self, assignment, nclusters=None, update=True, history=True,
                 sample_size=10, nreassign=10, max_stayed_put=25,
                 max_resets=5, max_done_worse=5, max_iter=1000):
        """Hill-climb from `assignment` at a fixed cluster count.

        Stops after too many non-improving iterations, resets, or
        max_iter total iterations; returns the local best assignment.
        """
        if nclusters is None:
            nclusters = self.nclusters
        assignment = self.constrain_assignment(assignment, nclusters)
        local_best_assignment = assignment
        local_best_score = self.scorer.score(local_best_assignment,
                                             history=False)
        current_assignment = local_best_assignment
        self.sampled = []
        print(self.status(current_assignment))
        while True:
            if self.stayed_put > max_stayed_put:
                print('stayed put too many times ({0})'.format(max_stayed_put))
                break
            if self.resets == max_resets:
                print('Reset limit reached ({0})'.format(max_resets))
                break
            if self.done_worse == max_done_worse:
                print('wandered off, resetting...')
                self.resets += 1
                self.done_worse = 0
                current_assignment = local_best_assignment
            if self.i == max_iter:
                print('max iterations reached')
                break
            new_assignment = self.move(sample_size, current_assignment,
                                       nreassign)
            new_assignment = self.constrain_assignment(new_assignment,
                                                       nclusters)
            score = self.scorer.score(new_assignment, history=history)
            self.update(new_assignment)
            if (score - local_best_score) > EPS:
                # improved: restart sampling from scratch
                self.sampled = []
                local_best_score = score
                local_best_assignment = new_assignment
                self.stayed_put = 0
                self.done_worse = 0
                self.resets = 0
                print(self.status(new_assignment, '(Improved)'))
            elif np.abs(score - local_best_score) < EPS:
                self.stayed_put += 1
                self.done_worse = 0
                message = ('(No improvement - [{}/{}])'.format(
                    self.stayed_put, max_stayed_put))
                print(self.status(new_assignment, message))
            else:
                self.sampled = []
                self.done_worse += 1
                message = '(Did worse - [{}/{}]'.format(self.done_worse,
                                                        max_done_worse)
                print(self.status(new_assignment, message))
            self.i += 1
        self._reset_counts()
        return local_best_assignment

    def optimise_with_variable_clusters(self, assignment, target_clusters,
                                        max_clusters, optimise_on_ascent=True,
                                        optimise_on_descent=True, update=True,
                                        **kwargs):
        """Ascend to `max_clusters` then descend to `target_clusters`,
        optionally optimising at every cluster count along the way."""
        if max_clusters < target_clusters:
            raise ValueError('max_clusters ({}) must be at least equal to '
                             'target_clusters ({})'.format(max_clusters,
                                                           target_clusters))
        current_clusters = len(assignment)
        print('Optimising current assignment with {} clusters. Optimiser will '
              'ascend to {} clusters, and descend to a target of {} clusters'
              '.'.format(current_clusters, max_clusters, target_clusters))
        for n in range(current_clusters, max_clusters + 1):
            print("ASCENDING (optimisation:{}) -> Current target: "
                  "{} clusters".format(('ON' if optimise_on_ascent else 'OFF'),
                                       n))
            if optimise_on_ascent:
                assignment = self.optimise(assignment, nclusters=n, **kwargs)
            else:
                assignment = self.constrain_assignment(assignment, n)
        for n in range(max_clusters - 1, target_clusters - 1, -1):
            print('DESCENDING (optimisation:{}) -> Current target: {} '
                  'clusters'.format(('ON' if optimise_on_descent else 'OFF'),
                                    n))
            if optimise_on_descent:
                assignment = self.optimise(assignment, nclusters=n, **kwargs)
            else:
                assignment = self.constrain_assignment(assignment, n)
        return self.constrain_assignment(assignment, target_clusters)

    def write(self, filename):
        """Dump the scorer's history to a tab-separated file."""
        headers = ['Iteration', 'CPU Time', 'Likelihood', 'Partition',
                   'NClusters']
        # BUGFIX: len(x[-1]) is an int; wrap it in a list before
        # concatenating (list + int raised TypeError)
        output = [[i] + x + [len(x[-1])]
                  for (i, x) in enumerate(self.scorer.history)]
        with open(filename, 'w+') as file_:
            writer = csv.writer(file_, delimiter='\t', quoting=csv.QUOTE_NONE)
            writer.writerow(headers)
            writer.writerows(output)
class EMTrees(object):
    """Expectation-maximisation-style clustering of sequence records.

    Records are iteratively reassigned to the cluster whose tree gives them
    the best score (likelihood via ml(), or negative tree distance via
    dist()), and cluster trees are re-estimated, until the overall score
    stops improving.
    """

    def __init__(
        self,
        collection,
        nclusters,
        metric='euc',
        tmpdir=None,
    ):
        """
        :param collection: object providing .records, .analysis, .datatype
            and .tmpdir attributes
        :param nclusters: number of clusters; must be an int > 1
        :param metric: tree-distance metric name passed to DistanceMatrix
        :param tmpdir: working directory for external programs; defaults to
            collection.tmpdir
        :raises ValueError: if nclusters is not an int > 1 (ValueError is a
            subclass of Exception, so existing callers are unaffected)
        """
        if not isinstance(nclusters, int) or nclusters <= 1:
            raise ValueError('Need appropriate value for number of clusters.')

        self.nclusters = nclusters
        self.scorer = Scorer(collection.records, collection.analysis)  # Could check for entries
        self.datatype = collection.datatype
        self.metric = metric
        # BUG FIX: the original `try: self.tmpdir / except: ...` always
        # raised AttributeError (the attribute is never set beforehand) and
        # therefore silently ignored the tmpdir argument.
        self.tmpdir = tmpdir if tmpdir is not None else collection.tmpdir

    def clusters_init(self):
        """Seed the k clusters with one randomly chosen record each, then
        assign every remaining record to its best-scoring cluster.

        Sets self.partition and self.L.
        """
        k = self.nclusters
        assignment = [0] * len(self.scorer.records)

        # NOTE(review): two seeds can land on the same index, leaving fewer
        # than k non-empty groups and making get_membership() shorter than
        # expected -- confirm whether collisions should be re-drawn.
        for i in range(k):
            assignment[np.random.randint(0, len(assignment))] = i + 1

        partition = Partition(assignment)
        clusters = [0] * k
        members = partition.get_membership()[1:]  # skip unassigned group 0
        self.assign_clusters(clusters, members)

        for (index, record) in enumerate(self.scorer.records):
            scores = [self.ml(record, clusters[n])
                      for n in range(self.nclusters)]
            # only move records that are unassigned or share their label
            if (assignment.count(assignment[index]) > 1
                    or assignment[index] == 0):
                assignment[index] = scores.index(max(scores)) + 1

        self.partition = Partition(assignment)
        self.L = self.scorer.score(self.partition)

    def random_partition(self):
        """Start from a uniformly random assignment of records to clusters.

        Sets self.partition and self.L.
        """
        # NOTE(review): labels drawn here are 0..k-1, whereas clusters_init
        # and the maximise methods use 1..k -- verify downstream code
        # tolerates both labellings.
        self.partition = Partition(tuple(np.random.randint(
            self.nclusters, size=len(self.scorer.records))))
        self.L = self.scorer.score(self.partition)

    def assign_clusters(self, clusters, members):
        """Rebuild the Cluster object for every group whose membership has
        changed; mutates `clusters` in place and returns it."""
        for n in range(self.nclusters):
            if not clusters[n] or clusters[n].members != members[n]:
                clusters[n] = Cluster(members[n], self.scorer.records,
                                      self.scorer.analysis)
        return clusters

    def maximise(self, method):
        """Deterministic EM loop: reassign every record to its best cluster
        under the score function named by `method` ('ml' or 'dist'), until
        the overall score fails to improve twice."""
        clusters = [0] * self.nclusters
        alg = getattr(self, method)
        count = 0

        while True:
            self.assign_clusters(clusters, self.partition.get_membership())
            assignment = list(self.partition.partition_vector)

            for (index, record) in enumerate(self.scorer.records):
                scores = [alg(record, clusters[n])
                          for n in range(self.nclusters)]
                if (assignment.count(assignment[index]) > 1
                        or assignment[index] == 0):
                    assignment[index] = scores.index(max(scores)) + 1

            assignment = Partition(assignment)
            score = self.scorer.score(assignment)
            if score > self.L:
                self.L = score
                self.partition = assignment
            else:
                count += 1
                if count > 1:
                    break  # Algorithm is deterministic so no need for more iterations

    def maximise_random(self, method):
        """Like maximise(), but moves one randomly chosen record at a time,
        stopping after a run of non-improving moves."""
        clusters = [0] * self.nclusters
        alg = getattr(self, method)
        count = 0
        sampled = []

        while True:
            self.assign_clusters(clusters, self.partition.get_membership())
            assignment = list(self.partition.partition_vector)
            index = randint(0, len(self.scorer.records) - 1)
            if index in sampled:
                # already tried since the last improvement; redraw
                continue
            record = self.scorer.records[index]
            sampled.append(index)
            scores = [alg(record, clusters[n])
                      for n in range(self.nclusters)]
            if (assignment.count(assignment[index]) > 1
                    or assignment[index] == 0):
                assignment[index] = scores.index(max(scores)) + 1
            assignment = Partition(assignment)
            score = self.scorer.score(assignment)
            if score > self.L:
                self.L = score
                self.partition = assignment
                sampled = []
                count = 0
            else:
                count += 1
                # NOTE(review): len(assignment) on a Partition appears to be
                # the number of clusters (see len() usage elsewhere in this
                # file), not the number of records -- confirm the intended
                # stopping condition.
                if count == len(assignment):
                    break

    def maximise_heuristic(self):
        """Stochastic reassignment for 1000 iterations: a random record is
        moved to its best or second-best cluster, chosen with probability
        proportional to likelihood; the best assignment seen is recorded in
        self.max_L / self.max_partition."""
        clusters = [0] * self.nclusters
        sampled = []

        for i in range(1000):
            self.assign_clusters(clusters, self.partition.get_membership())
            assignment = list(self.partition.partition_vector)
            index = randint(0, len(self.scorer.records) - 1)
            record = self.scorer.records[index]
            sampled.append(index)

            lls = [self.ml(record, clusters[n])
                   for n in range(self.nclusters)]
            # BUG FIX: the original popped the best log-likelihood and then
            # looked up the runner-up's index in the shortened list, which
            # mis-identifies the runner-up cluster whenever it comes after
            # the best one in the original ordering.
            order = sorted(range(len(lls)), key=lls.__getitem__, reverse=True)
            a = {'n': order[0], 'll': lls[order[0]]}
            b = {'n': order[1], 'll': lls[order[1]]}

            # BUG FIX: np.maths.exp does not exist (AttributeError -- this
            # method could never run); use np.exp.
            a['p'] = np.exp(a['ll'] - logsum(a['ll'], b['ll']))
            # BUG FIX: the comparison was inverted; a['p'] is the probability
            # of choosing the best cluster, so pick it when uniform() < p.
            if np.random.uniform() < a['p']:
                choice = a['n']
            else:
                choice = b['n']

            if (assignment.count(assignment[index]) > 1
                    or assignment[index] == 0):
                assignment[index] = choice + 1
            assignment = Partition(assignment)

            if i % 10 == 0:
                score = self.scorer.score(assignment)
                if score > self.L:
                    # NOTE(review): the running best goes to max_L /
                    # max_partition while the comparison baseline self.L is
                    # never advanced -- confirm whether self.L should also be
                    # updated here, as the other maximise methods do.
                    self.max_L = score
                    self.max_partition = assignment

    def dist(self, obj1, obj2):
        """Negated tree distance between two records' trees, so that
        'higher is better' matches the ml() convention."""
        distance = DistanceMatrix([obj1.tree, obj2.tree], self.metric)[0][1]
        return -distance

    def ml(self, record, cluster, verbose=1):
        """Score `record` against `cluster` by running Phyml on the record
        with the cluster's tree supplied as a fixed input topology.

        :param record: sequence record to score
        :param cluster: Cluster whose .tree is used as the input tree
        :param verbose: verbosity level passed to Phyml.run
        :returns: the Phyml score (log-likelihood)
        """
        p = Phyml(record, tmpdir=self.tmpdir)
        input_tree = os.path.join(self.tmpdir, 'input_tree')
        cluster.tree.write_to_file(input_tree)
        p.add_tempfile(input_tree)
        p.add_flag('--inputtree', input_tree)
        p.add_flag('-o', 'r')   # optimise substitution rates only
        p.add_flag('-a', 'e')   # estimate the gamma shape parameter
        p.add_flag('-b', 0)     # no bootstrap replicates
        p.add_flag('-c', 4)     # four rate categories
        p.add_flag('--quiet', '')
        if self.datatype == 'protein':
            p.add_flag('-d', 'aa')
        elif self.datatype == 'dna':
            p.add_flag('-d', 'nt')
        return p.run(verbosity=verbose).score