def pssms_for_K_mer(self, K_mer):
    """
    Build the four PSSMs associated with a K-mer.

    A nucleotide distribution is derived from the K-mer twice, once without
    and once with the gap, and each distribution is converted to log scores
    plus the complementary version of those scores.

    @return: pssm, comp_pssm, gap_pssm, gap_comp_pssm
    """
    ungapped_dist = self.nucleo_dist_from_K_mer(K_mer, include_gap=False)
    gapped_dist = self.nucleo_dist_from_K_mer(K_mer, include_gap=True)

    # For each distribution emit the forward scores followed by their
    # complement, which yields the documented return order.
    pssms = []
    for distribution in (ungapped_dist, gapped_dist):
        forward_scores = hmm.calculate_log_scores(distribution)
        pssms.append(forward_scores)
        pssms.append(hmm.calculate_complementary_scores(forward_scores))
    return tuple(pssms)
def evaluate_gap_position(
    L_mer,
    gap_index,
    sequences,
    bg_L_mer_scores,
    pssm_scores,
    comp_pssm_scores,
    options
):
    """
    Evaluate a k-mer with a gap in a particular position.

    A gapped PSSM and its complement are built from L_mer with the gap at
    gap_index and scored against the sequences. Their scores are stacked
    with the supplied ungapped scores (pssm_scores, comp_pssm_scores), the
    maximum is taken across the four PSSMs, negative maxima are zeroed, and
    the sum is normalised by len(L_mer) and by the number of maxima.

    @return: the normalised score for this gap position
    """
    gapped_pssm = hmm.calculate_log_scores(
        nucleo_dist_from_mer(
            L_mer,
            options.pseudo_count_for_L_mer_scoring,
            gap_index=gap_index
        )
    )
    gapped_comp_pssm = hmm.calculate_complementary_scores(gapped_pssm)

    # Rows: ungapped, ungapped complement, gapped, gapped complement.
    score_rows = [pssm_scores, comp_pssm_scores]
    for scoring_pssm in (gapped_pssm, gapped_comp_pssm):
        score_rows.append(
            hmm.max_scores_in_sequences(scoring_pssm, sequences, bg_L_mer_scores)
        )
    best = numpy.array(score_rows).max(axis=0)

    # when we didn't find sites, ignore
    below_zero = best < 0.
    best[below_zero] = 0.

    per_base_total = best.sum() / len(L_mer)
    score = per_base_total / len(best)
    logging.debug(
        'Evaluated: %s; gap: %d; score: %f',
        numpy_to_seq(L_mer),
        gap_index,
        score
    )
    return score
def evaluate_L_mer(L_mer, sequences, bg_L_mer_scores, gap_positions, options):
    """
    Evaluate an L-mer over every candidate gap position and pick the best.

    The ungapped PSSM and its complement are scored against the sequences
    once, then reused for each gap position in gap_positions.

    @return: the largest (score, gap_index) tuple; tuples are compared in the
        usual way, so equal scores are separated by gap_index
    """
    ungapped_pssm = hmm.calculate_log_scores(
        nucleo_dist_from_mer(
            L_mer,
            options.pseudo_count_for_L_mer_scoring,
            gap_index=None
        )
    )
    ungapped_comp_pssm = hmm.calculate_complementary_scores(ungapped_pssm)

    forward_scores = hmm.max_scores_in_sequences(
        ungapped_pssm, sequences, bg_L_mer_scores
    )
    reverse_scores = hmm.max_scores_in_sequences(
        ungapped_comp_pssm, sequences, bg_L_mer_scores
    )

    candidates = [
        (
            evaluate_gap_position(
                L_mer,
                position,
                sequences,
                bg_L_mer_scores,
                forward_scores,
                reverse_scores,
                options
            ),
            position
        )
        for position in gap_positions
    ]
    return max(candidates)
# Demo / smoke run of the hmm PSSM scoring helpers.

# A 12-position motif whose positions alternate between favouring symbol 0
# and symbol 1 (probability .85, with .05 for each of the other three).
nucleo_dists = N.array(
    [
        [.85, .05, .05, .05],
        [.05, .85, .05, .05],
    ] * 6
)

logging.info('Calculating log scores')
pssm_scores = hmm.calculate_log_scores(nucleo_dists)

logging.info('Calculating complementary scores')
comp_scores = hmm.calculate_complementary_scores(pssm_scores)

logging.info('Preprocessing sequence')
# Four repeats of 0, 1, 2, 3 followed by a single trailing 4.
# NOTE(review): 4 presumably encodes an unknown base - confirm against hmm.
seq = hmm.preprocess_sequence(N.array([0, 1, 2, 3] * 4 + [4]))

logging.info('Scoring sequence')
logging.info(hmm.score_sequence(pssm_scores, seq))
logging.info('Scoring sequence with complementary scores')
logging.info(hmm.score_sequence(comp_scores, seq))

logging.info(hmm.max_score_in_sequence(pssm_scores, seq))
logging.info(hmm.max_score_in_sequence(comp_scores, seq))

# 100 random sequences of length 10000 with symbols drawn from 0..4.
# randint's upper bound is exclusive, so randint(0, 5) covers the same
# inclusive range as the deprecated random_integers(0, 4); range() is used
# instead of xrange() so the script also runs under Python 3.
long_seqs = [R.randint(0, 5, size=10000) for i in range(100)]
[.85, .05, .05, .05], [.05, .85, .05, .05], [.85, .05, .05, .05], [.05, .85, .05, .05], [.85, .05, .05, .05], [.05, .85, .05, .05], [.85, .05, .05, .05], [.05, .85, .05, .05], [.85, .05, .05, .05], [.05, .85, .05, .05], [.85, .05, .05, .05], [.05, .85, .05, .05], ] ) logging.info('Calculating log scores') pssm_scores = hmm.calculate_log_scores(nucleo_dists) logging.info('Calculating complementary scores') comp_scores = hmm.calculate_complementary_scores(pssm_scores) logging.info('Preprocessing sequence') seq = hmm.preprocess_sequence(N.array([0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,4])) logging.info('Scoring sequence') logging.info(hmm.score_sequence(pssm_scores, seq)) logging.info('Scoring sequence with complementary scores') logging.info(hmm.score_sequence(comp_scores, seq)) logging.info(hmm.max_score_in_sequence(pssm_scores, seq)) logging.info(hmm.max_score_in_sequence(comp_scores, seq)) long_seqs = [R.random_integers(0,4,size=10000) for i in xrange(100)] long_seqs_preprocessed = hmm.preprocess_sequences(long_seqs)