def __call__(self, k_mer):
    """
    Decide whether a K-mer passes the filter and, if so, remember it.

    The first K-mer seen (while C{self.passed} is still empty) always
    passes. Afterwards a K-mer passes when the value returned by
    C{self._distance_to_passed(k_mer)} is at least C{self.min_distance}.
    Every K-mer that passes is converted with C{numpy_to_seq} and added to
    C{self.passed}, so calling the filter has a side effect.

    @param k_mer: The K-mer to test, in whatever form C{numpy_to_seq} and
        C{self._distance_to_passed} accept (presumably a numpy array -
        confirm against the callers).
    @return: True if and only if the K-mer passes the filter.
    """
    if not self.passed:
        # Nothing has passed yet, so there is nothing to be too close to.
        result = True
    else:
        d = self._distance_to_passed(k_mer)
        result = d >= self.min_distance
        if result:
            logging.debug('%s is %d away from previous k-mers', numpy_to_seq(k_mer), d)
    # Plain truthiness test rather than comparing against True.
    if result:
        self.passed.add(numpy_to_seq(k_mer))
    return result
def evaluate_gap_position(
    L_mer,
    gap_index,
    sequences,
    bg_L_mer_scores,
    pssm_scores,
    comp_pssm_scores,
    options
):
    """
    Score an L-mer when a gap is placed at one particular position.

    A gapped PSSM and its complementary PSSM are built from the L-mer, and
    their maximum scores in the sequences are computed. These are stacked
    with the supplied ungapped scores (C{pssm_scores} and
    C{comp_pssm_scores}) and the element-wise best of the four is taken.
    Negative best scores are replaced by zero, and the total is normalised
    by the length of the L-mer and by the number of best scores
    (presumably one per sequence - confirm against
    C{hmm.max_scores_in_sequences}).

    @return: The normalised score for this gap position.
    """
    # Gapped PSSM for this L-mer / gap position, plus its complement.
    gapped_pssm = hmm.calculate_log_scores(
        nucleo_dist_from_mer(
            L_mer,
            options.pseudo_count_for_L_mer_scoring,
            gap_index=gap_index
        )
    )
    gapped_comp_pssm = hmm.calculate_complementary_scores(gapped_pssm)

    # One row of scores per PSSM: ungapped rows first, then the gapped ones.
    score_rows = [pssm_scores, comp_pssm_scores]
    for pssm in (gapped_pssm, gapped_comp_pssm):
        score_rows.append(hmm.max_scores_in_sequences(pssm, sequences, bg_L_mer_scores))

    # Element-wise best over the four PSSMs.
    top_scores = numpy.array(score_rows).max(axis=0)
    # When we didn't find sites, ignore them (negative scores count as zero).
    top_scores[top_scores < 0.] = 0.

    score = top_scores.sum() / len(L_mer) / len(top_scores)
    logging.debug(
        'Evaluated: %s; gap: %d; score: %f',
        numpy_to_seq(L_mer), gap_index, score
    )
    return score
def _load_or_learn_bg_model(sequences, options):
    """
    Load the background model from C{options.bg_model_filename} if that file
    exists, otherwise learn a new one (and save it when a filename was given).

    @return: A tuple C{(bg_model, converted_seqs)}.
    """
    filename = options.bg_model_filename
    if filename is not None and os.path.exists(filename):
        logging.info("Loading supplied background model from %s", filename)
        # NOTE(review): unpickling can execute arbitrary code; only point
        # bg_model_filename at files from a trusted source.
        # The file mode is left as the original default so that models saved
        # by earlier runs keep loading the same way.
        with open(filename) as model_file:
            bg_model = cPickle.load(model_file)
        converted_seqs = [bg_model.converter.to_order_n(s) for s in sequences]
    else:
        logging.info("Learning new background model")
        bg_model, converted_seqs = learn_bg_model(
            sequences,
            num_mosaics=options.bg_model_num_mosaics,
            order=options.bg_model_order
        )
        if filename:
            logging.info("Saving background model to %s", filename)
            # Close the file deterministically so the pickle is fully flushed.
            with open(filename, 'w') as model_file:
                cPickle.dump(bg_model, model_file)
    return bg_model, converted_seqs


def generate_seeds(
    sequences,
    preprocessed_sequences,
    options
):
    """
    Generate a list of candidate L-mers and score them to find the best seed L-mer and gap position.

    The background model is loaded or learnt first. Candidate L-mers are
    either the single seed given by C{options.force_seed} or the top L-mers
    returned by C{hmm.top_mers_by_sequence_membership}. If
    C{options.force_gap} is set every candidate is given that gap and a score
    of 0.0; otherwise candidates are filtered by distance to previously
    accepted ones and each survivor is evaluated with C{evaluate_L_mer}.

    @param sequences: The sequences used for the background model and for
        evaluating L-mers.
    @param preprocessed_sequences: Passed to
        C{hmm.top_mers_by_sequence_membership} to find candidate L-mers.
    @param options: Object carrying the settings read here
        (C{bg_model_filename}, C{force_seed}, C{force_gap}, C{L},
        C{max_L_mers_to_evaluate}, C{seed_filter_distance}, ...).
    @return: A tuple C{(L_mer_seeds, bg_model)} where each seed is a tuple
        C{(sequence, count, number of sequences, gap index, score)}. When
        seeds are evaluated they are sorted by score, highest first.
    """
    bg_model, converted_seqs = _load_or_learn_bg_model(sequences, options)

    if options.force_seed:
        logging.info('Forcing seed to be: %s', options.force_seed)
        L_mers = [(seq_to_numpy(options.force_seed), len(sequences), len(sequences))]
    else:
        # Find best candidate L-mers
        start = time.time()
        num_L_mers_to_find = 3 * options.max_L_mers_to_evaluate
        logging.info(
            'Finding best %d candidate %d-mers to seed HMM emissions',
            num_L_mers_to_find, options.L
        )
        L_mers = hmm.top_mers_by_sequence_membership(
            preprocessed_sequences,
            k=options.L,
            n=num_L_mers_to_find
        )
        logging.info(
            'Finding top %d %d-mers took %f seconds',
            len(L_mers), options.L, time.time() - start
        )

    if options.force_gap:
        logging.info('Forcing gap at position: %d', options.force_gap)
        L_mer_seeds = [
            (numpy_to_seq(L_mer), L_mer_count, L_mer_num_seqs, options.force_gap, 0.0)
            for L_mer, L_mer_count, L_mer_num_seqs in L_mers
        ]
        return L_mer_seeds, bg_model

    # Everything the evaluation needs is set up here, independently of how
    # the candidates were obtained, so that a forced seed without a forced
    # gap can be evaluated too.

    # Calculate log likelihood of L-mers under background model.
    bg_L_mer_scores = calculate_k_mer_scores(bg_model, converted_seqs, options.L)
    distance = K_mer_distance(allowed_shifts=options.allowed_shifts, shift_cost=options.shift_cost)

    # Floor division keeps these integers (they feed range() and '%d').
    gap_end_offset = options.L // 5 + 1
    if -1 == options.seed_filter_distance:
        # -1 selects a default minimum distance derived from L.
        min_distance = options.L // 4 + 1
    else:
        min_distance = options.seed_filter_distance

    logging.info('Positioning gaps up to %d bases from end of K-mers', gap_end_offset)
    gap_positions = range(gap_end_offset, options.L + 1 - gap_end_offset)
    logging.info('Filtering K-mers that are not %d away from previously evaluated.', min_distance)
    logging.info('Evaluating up to %d L-mers.', options.max_L_mers_to_evaluate)

    # Evaluate L-mers
    L_mer_filter = DistanceFilter(distance, min_distance=min_distance)
    L_mer_seeds = []
    discarded = 0
    evaluated = 0
    for L_mer, L_mer_count, L_mer_num_seqs in L_mers:
        # Note the filter is consulted first and records L-mers that pass it.
        # NOTE(review): 4 presumably encodes an unknown base - confirm.
        if not L_mer_filter(L_mer) or 4 in L_mer:
            logging.debug(
                'Discarding: %s; count: %d; # sequences: %d',
                numpy_to_seq(L_mer), L_mer_count, L_mer_num_seqs
            )
            discarded += 1
        else:
            score, gap_index = evaluate_L_mer(L_mer, sequences, bg_L_mer_scores, gap_positions, options)
            evaluated += 1
            logging.info(
                'Evaluated (%3d/%d): %s; gap: %d; count: %d; # sequences: %d; score: %f',
                evaluated, options.max_L_mers_to_evaluate, numpy_to_seq(L_mer),
                gap_index, L_mer_count, L_mer_num_seqs, score
            )
            L_mer_seeds.append((numpy_to_seq(L_mer), L_mer_count, L_mer_num_seqs, gap_index, score))
            if len(L_mer_seeds) == options.max_L_mers_to_evaluate:
                break

    L_mer_seeds.sort(key=lambda x: -x[4])  # sort by score, highest first
    logging.info('Discarded %d L-mers using edit distance', discarded)
    if L_mer_seeds:
        logging.info(
            'Evaluated %d L-mers: scores range from %f to %f',
            evaluated, L_mer_seeds[-1][4], L_mer_seeds[0][4]
        )
    else:
        # No candidate survived, so there is no score range to report.
        logging.warning('Evaluated %d L-mers: no seeds were generated', evaluated)

    return L_mer_seeds, bg_model