def calculate_n_mer_significances(seqs, n, background=None):
    """
    Count all n-mers in the sequences and score each count against a background.

    The n-mers are counted with hmm.count_mers using an _AllNMers callback and
    then merged by collapse_rev_comps. Each (mer, count) pair is scored as

        log_fact[total] - log_fact[count] - log_fact[total - count]
            + count * background_LL + (total - count) * foreground_LL

    where total is the sum of all collapsed counts, background_LL is the
    log-likelihood of the mer under the background and foreground_LL is
    log(1 - exp(background_LL)). Assuming _LogFactorial()[k] is log(k!)
    (TODO confirm), this is the log binomial probability of observing exactly
    `count` occurrences of the mer.

    @arg seqs: The sequences, passed straight to hmm.count_mers.
    @arg n: The length of the mers to count.
    @arg background: An object whose LL(mer) method gives the log-likelihood
        of a mer. If None, a uniform distribution over the 4 bases is assumed,
        i.e. every mer has log-likelihood n * log(.25).
    @return: A list of (mer, count, score) tuples sorted by ascending score.
    """
    from sys import getrecursionlimit

    mer_counter = _AllNMers()
    hmm.count_mers(seqs, n, mer_counter)
    # Materialise the collapsed counts: they are iterated twice below, which
    # would silently yield nothing the second time if this were a generator.
    collapsed = list(collapse_rev_comps(mer_counter.n_mers))

    log_fact = _LogFactorial()
    total_counts = sum(count for _mer, count in collapsed)

    # Touch the table in strides of half the recursion limit before asking for
    # the largest entry. NOTE(review): presumably _LogFactorial is a memoised
    # recursive table and this keeps each lookup's recursion shallow - confirm.
    stride = getrecursionlimit() // 2
    i = 1
    while i < total_counts:
        log_fact[i]
        i += stride
    log_fact_total = log_fact[total_counts]

    def log_complement(log_p):
        # log(1 - exp(log_p)); log1p keeps precision when exp(log_p) is tiny.
        return math.log1p(-math.exp(log_p))

    if background is None:
        # Uniform model: the same likelihoods apply to every mer.
        background_LL = n * math.log(.25)
        foreground_LL = log_complement(background_LL)

    result = []
    for mer, count in collapsed:
        if background is not None:
            background_LL = background.LL(mer)
            foreground_LL = log_complement(background_LL)
        log_bernoulli = (
            log_fact_total
            - log_fact[count]
            - log_fact[total_counts - count]
            + count * background_LL
            + (total_counts - count) * foreground_LL
        )
        result.append((mer, count, log_bernoulli))

    # Ascending by score; a key sort is stable, like the cmp-based sort it replaces.
    result.sort(key=lambda entry: entry[2])
    return result
def __call__(self, sequences):
    """
    Run the motif finding algorithm.

    Counts every K-mer of length self.init_K_mer_length in the sequences
    (collapsed with reverse complements), scores the candidates produced by
    self.yield_evaluations, builds a model from the best-scoring K-mer with
    self.model_for_initialisation_K_mer and trains it with Baum-Welch.

    @arg sequences: The sequences to search. They are handed to
        hmm.preprocess_sequences and hmm.count_mers, and len() is taken of
        each one to count the bases.
    @return: The model after Baum-Welch training.
    """
    preprocessed_sequences = hmm.preprocess_sequences(sequences)

    # how big are the sequences
    num_bases = sum(len(s) for s in sequences)

    # find all K-mers collapsed with their reverse complements
    logging.info("Finding all %d-mers in sequences", self.init_K_mer_length)
    start = time.time()
    nmer_counts = hmm.ReverseComplementCollapsingCounter(self.init_K_mer_length)
    hmm.count_mers(sequences, n=self.init_K_mer_length, callback=nmer_counts)
    logging.info("Took %f seconds to find %d-mers", time.time() - start, self.init_K_mer_length)

    # float() guards against Python 2 integer division truncating the
    # probability to 0 when expected_sites_per_sequence is an int.
    p_binding_site = (self.expected_sites_per_sequence * len(sequences)) / float(num_bases)
    logging.info("Found %d %d-mers", nmer_counts.num_counts(), self.init_K_mer_length)

    # evaluate the candidate starting points and keep the one with the highest
    # score (element 1 of each evaluation)
    start = time.time()
    best_starting_point = max(
        self.yield_evaluations(nmer_counts, preprocessed_sequences),
        key=lambda evaluation: evaluation[1]
    )
    logging.info("Evaluation took %f seconds", time.time() - start)
    # Pass the two fields explicitly so logging formats lazily and the call
    # does not fail if an evaluation carries more than two fields.
    logging.info("Best starting point: %s: %f", best_starting_point[0], best_starting_point[1])

    model = self.model_for_initialisation_K_mer(best_starting_point[0], p_binding_site)

    logging.info("Running Baum-Welch")
    start = time.time()
    LL, num_iterations = model.baum_welch(preprocessed_sequences)
    logging.info("Baum-Welch took %f seconds", time.time() - start)
    logging.info("Achieved LL: %f in %d iterations", LL, num_iterations)

    return model
def yield_k_mers(sequences, K):
    """
    Generate the K-mers of the sequences, most frequent first.

    K-mers are counted with a ReverseComplementCollapsingCounter. K-mers with
    equal counts come out in the order the counter reported them.

    @arg sequences: The sequences, passed to hmm.count_mers.
    @arg K: The length of the mers to count.
    @return: Yield (K-mer, count) tuples in order of decreasing count.
    """
    import time
    from heapq import heapify, heappop
    from hmm import ReverseComplementCollapsingCounter, count_mers

    # count every K-mer, merging each one with its reverse complement
    logging.info('Finding all %d-mers in sequences', K)
    counting_started = time.time()
    counter = ReverseComplementCollapsingCounter(K)
    count_mers(sequences, n=K, callback=counter)
    logging.info('Took %f seconds to find %d-mers', time.time()-counting_started, K)

    # Min-heap on the negated count so the largest count pops first. The
    # enumeration index breaks ties, so two K-mers are never compared directly.
    heap_started = time.time()
    heap = [
        (-occurrences, index, mer)
        for index, (mer, occurrences) in enumerate(counter.counts())
    ]
    heapify(heap)
    logging.info('Took %f seconds to heapify', time.time()-heap_started)

    while heap:
        negated_count, _index, mer = heappop(heap)
        yield mer, -negated_count
def top_k_n_mers(seqs, n, k):
    """
    Return the k largest entries among the n-mers counted in the sequences.

    The n-mers are gathered by running hmm.count_mers with an _AllNMers
    callback; its n_mers entries are ranked on their second element
    (presumably the count - verify against _AllNMers).

    @arg seqs: The sequences, passed to hmm.count_mers.
    @arg n: The length of the mers to count.
    @arg k: How many entries to return.
    @return: A list of at most k entries of n_mers, largest first.
    """
    import heapq

    def second_field(entry):
        return entry[1]

    collector = _AllNMers()
    hmm.count_mers(seqs, n, collector)
    return heapq.nlargest(k, collector.n_mers, key=second_field)
def most_common_n_mer(seqs, n):
    """
    Find the n-mer that a _MostCommonNMer callback reports as best.

    @arg seqs: The sequences, passed to hmm.count_mers.
    @arg n: The length of the mers to count.
    @return: A (best_mer, best_count) tuple read off the callback once
        hmm.count_mers has run.
    """
    tracker = _MostCommonNMer()
    hmm.count_mers(seqs, n, tracker)
    winner = (tracker.best_mer, tracker.best_count)
    return winner
numpy.array([0,1,2]) ] print most_common_n_mer(seqs, 3) print most_common_n_mer(seqs, 5) print c=hmm.MarkovOrderConverter(4,2) a=numpy.array([0,1,2]) order_n_obs = c.convert_to_order_n_observation(a) a_copy = c.convert_from_order_n_observation(order_n_obs) print a print a_copy print a.all() == a_copy.all() print seqs = [ numpy.array([3,2,1]), numpy.array([0,1,2]), numpy.array([0,1,2]), numpy.array([1,2,3]), ] n = 3 all = _AllNMers() hmm.count_mers(seqs, n, all) for mer, count in collapse_rev_comps(all.n_mers): print mer, count print significant_mers = calculate_n_mer_significances(seqs, 3, None) print significant_mers[:10]
# Draw one sample set per model, report some statistics about the samples and
# write each set out as a FASTA-style file.
#
# The print calls below are written with a single pre-formatted argument so
# that they produce the same output as the Python 2 print statement while
# also parsing under Python 3.
samples = [sample_from(model, N, L) for model in models]
print("Binding bases per model: %s" % (
    [sum(sum(s[0] != 0) for s in sample) for sample in samples],
))
print("Sites per sample: %s" % (
    [
        sum(len([site for site in yield_sites_in_states(s[0], (0,))]) for s in sample)
        for sample in samples
    ],
))

print("Converting sequences")
sequence_sets = [[hmm.pssm.numpy_to_seq(s[1]) for s in sample] for sample in samples]

print("Writing sequences")
for i, sequences in enumerate(sequence_sets):
    # 'with' closes the file even if one of the writes raises.
    with open("synthetic-2/synthetic-sequences-%s.fa" % tag(i), "w") as f:
        for j, s in enumerate(sequences):
            f.write("> sequence %d\n" % j)
            f.write(s)
            f.write("\n")

# Diagnostic that is switched off: lists the 10 largest K-mer counts in each
# sample set.
if False:
    import heapq
    for i, sample in enumerate(samples):
        print("10 largest counts in sequence set: %d" % i)
        nmer_counts = hmm.ReverseComplementCollapsingCounter(K)
        hmm.count_mers(
            [sequence[1].astype(int) for sequence in sample],
            n=K,
            callback=nmer_counts
        )
        print("\n".join(
            "%s : %d" % (hmm.pssm.numpy_to_seq(nmer), count)
            for nmer, count
            in heapq.nlargest(10, nmer_counts.counts(), key=lambda entry: entry[1])
        ))