def __call__(self, sequences):
    """
    Run the motif finding algorithm.

    Counts all K-mers of length ``self.init_K_mer_length`` in the sequences
    (collapsed with their reverse complements), picks the candidate with the
    highest score produced by ``self.yield_evaluations``, builds a model
    from it via ``self.model_for_initialisation_K_mer`` and trains that
    model with Baum-Welch on the preprocessed sequences.

    :param sequences: The sequences to search. ``len()`` is taken of the
        container and of every sequence in it.
    :returns: The model after Baum-Welch has been run on it.
    :raises ZeroDivisionError: If the sequences contain no bases at all
        (provided the earlier counting steps accept such input).
    :raises ValueError: If ``self.yield_evaluations`` yields nothing, as
        ``max()`` cannot choose from an empty iterable.
    """
    preprocessed_sequences = hmm.preprocess_sequences(sequences)
    k_mer_length = self.init_K_mer_length

    # how big are the sequences
    num_bases = sum(len(s) for s in sequences)

    # find all K-mers collapsed with their reverse complements
    logging.info("Finding all %d-mers in sequences", k_mer_length)
    start = time.time()
    nmer_counts = hmm.ReverseComplementCollapsingCounter(k_mer_length)
    hmm.count_mers(sequences, n=k_mer_length, callback=nmer_counts)
    logging.info(
        "Took %f seconds to find %d-mers", time.time() - start, k_mer_length)

    # Force true division: with integer operands Python 2 would floor the
    # quotient (almost always to 0), giving a useless prior for the model.
    expected_sites = self.expected_sites_per_sequence * len(sequences)
    p_binding_site = expected_sites / float(num_bases)
    logging.info(
        "Found %d %d-mers", nmer_counts.num_counts(), k_mer_length)

    # Evaluate the candidates and keep the one with the highest score; each
    # evaluation is indexed as (candidate, score).
    start = time.time()
    best_starting_point = max(
        self.yield_evaluations(nmer_counts, preprocessed_sequences),
        key=lambda evaluation: evaluation[1])
    logging.info("Evaluation took %f seconds", time.time() - start)
    # Hand the values to logging rather than formatting eagerly, in line
    # with the other logging calls in this method.
    logging.info(
        "Best starting point: %s: %f",
        best_starting_point[0], best_starting_point[1])

    # Build a model from the best candidate and train it.
    model = self.model_for_initialisation_K_mer(
        best_starting_point[0], p_binding_site)
    logging.info("Running Baum-Welch")
    start = time.time()
    LL, num_iterations = model.baum_welch(preprocessed_sequences)
    logging.info("Baum-Welch took %f seconds", time.time() - start)
    logging.info("Achieved LL: %f in %d iterations", LL, num_iterations)
    return model
# Smoke test / timing for the hmm scoring routines.
# `pssm_scores` is expected to have been defined before this point.
logging.info('Calculating complementary scores')
comp_scores = hmm.calculate_complementary_scores(pssm_scores)

# Score one short, fixed sequence with both sets of scores.
logging.info('Preprocessing sequence')
seq = hmm.preprocess_sequence(
    N.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4]))
logging.info('Scoring sequence')
logging.info(hmm.score_sequence(pssm_scores, seq))
logging.info('Scoring sequence with complementary scores')
logging.info(hmm.score_sequence(comp_scores, seq))
logging.info(hmm.max_score_in_sequence(pssm_scores, seq))
logging.info(hmm.max_score_in_sequence(comp_scores, seq))

# Time the maximum-score search over 100 random sequences of 10000 values.
# `range` replaces the Python-2-only `xrange` so this runs on Python 2 and 3.
# NOTE(review): if R is numpy.random, `random_integers` is deprecated in
# favour of `randint(low, high + 1)` - confirm before switching.
long_seqs = [R.random_integers(0, 4, size=10000) for i in range(100)]
long_seqs_preprocessed = hmm.preprocess_sequences(long_seqs)
logging.info('Starting to time max scores on long sequences.')
start = time.time()
max_scores = hmm.max_scores_in_sequences(pssm_scores, long_seqs_preprocessed)
max_comp_scores = hmm.max_scores_in_sequences(
    comp_scores, long_seqs_preprocessed)
# Let logging do the interpolation instead of formatting eagerly.
logging.info(
    'Max scores (and complementary scores) over sequences took %f secs',
    time.time() - start)

# Prepare a single sequence of 1000000 random values for the next timing.
long_seq = R.random_integers(0, 4, size=1000000)
long_seq_preprocessed = hmm.preprocess_sequence(long_seq)
logging.info('Starting to time max scores on long sequence.')
# Smoke test / timing for the hmm scoring routines.
# `nucleo_dists` is expected to have been defined before this point.
pssm_scores = hmm.calculate_log_scores(nucleo_dists)
logging.info('Calculating complementary scores')
comp_scores = hmm.calculate_complementary_scores(pssm_scores)

# Score one short, fixed sequence with both sets of scores.
logging.info('Preprocessing sequence')
seq = hmm.preprocess_sequence(
    N.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4]))
logging.info('Scoring sequence')
logging.info(hmm.score_sequence(pssm_scores, seq))
logging.info('Scoring sequence with complementary scores')
logging.info(hmm.score_sequence(comp_scores, seq))
logging.info(hmm.max_score_in_sequence(pssm_scores, seq))
logging.info(hmm.max_score_in_sequence(comp_scores, seq))

# Time the maximum-score search over 100 random sequences of 10000 values.
# `range` replaces the Python-2-only `xrange` so this runs on Python 2 and 3.
# NOTE(review): if R is numpy.random, `random_integers` is deprecated in
# favour of `randint(low, high + 1)` - confirm before switching.
long_seqs = [R.random_integers(0, 4, size=10000) for i in range(100)]
long_seqs_preprocessed = hmm.preprocess_sequences(long_seqs)
logging.info('Starting to time max scores on long sequences.')
start = time.time()
max_scores = hmm.max_scores_in_sequences(pssm_scores, long_seqs_preprocessed)
max_comp_scores = hmm.max_scores_in_sequences(
    comp_scores, long_seqs_preprocessed)
# Let logging do the interpolation instead of formatting eagerly.
logging.info(
    'Max scores (and complementary scores) over sequences took %f secs',
    time.time() - start)

# Time the maximum-score search over one sequence of 1000000 random values.
long_seq = R.random_integers(0, 4, size=1000000)
long_seq_preprocessed = hmm.preprocess_sequence(long_seq)
logging.info('Starting to time max scores on long sequence.')
start = time.time()
logging.info(hmm.max_score_in_sequence(pssm_scores, long_seq_preprocessed))
logging.info(hmm.max_score_in_sequence(comp_scores, long_seq_preprocessed))
logging.info(
    'Max scores (and complementary scores) for %d bases took %f secs',
    len(long_seq), time.time() - start)