Example #1
0
 def __call__(self, k_mer):
     """
     Test a K-mer against the filter and remember it if it passes.

     The K-mer passes when C{self.passed} is empty, or when the value
     returned by C{self._distance_to_passed(k_mer)} is at least
     C{self.min_distance}. A passing K-mer is converted with
     C{numpy_to_seq} and added to C{self.passed}, so it takes part in
     the distance check of later calls.

     @param k_mer: The K-mer to test, in the form accepted by
         C{numpy_to_seq} and C{self._distance_to_passed}.
     @return: True if and only if the K-mer passes the filter.
     """
     if self.passed:
         d = self._distance_to_passed(k_mer)
         result = d >= self.min_distance
     else:
         # Nothing has passed yet, so there is nothing to be too close to.
         d = None
         result = True
     if result:
         # Convert once and reuse for both the log message and the set entry.
         seq = numpy_to_seq(k_mer)
         if d is not None:
             logging.debug('%s is %d away from previous k-mers', seq, d)
         self.passed.add(seq)
     return result
def evaluate_gap_position(
    L_mer,
    gap_index,
    sequences,
    bg_L_mer_scores,
    pssm_scores,
    comp_pssm_scores,
    options
):
    """
    Evaluate a k-mer with a gap in a particular position.

    A gapped PSSM and its complement are derived from the L-mer. Their
    scores over the sequences are combined with the supplied ungapped
    scores by taking the element-wise maximum over the four score vectors.
    Negative maxima are treated as zero, and the total is normalised by
    the L-mer length and by the number of score entries.

    @param L_mer: The L-mer to evaluate, in the form accepted by
        C{nucleo_dist_from_mer} and C{numpy_to_seq}.
    @param gap_index: Position of the gap; passed on to C{nucleo_dist_from_mer}.
    @param sequences: Sequences handed to C{hmm.max_scores_in_sequences}.
    @param bg_L_mer_scores: Background scores handed to
        C{hmm.max_scores_in_sequences}.
    @param pssm_scores: Scores already computed for the ungapped PSSM.
    @param comp_pssm_scores: Scores already computed for the complement of
        the ungapped PSSM.
    @param options: Supplies C{pseudo_count_for_L_mer_scoring}.
    @return: The normalised score for this gap position.
    """
    # Build the gapped PSSM and its complementary counterpart.
    gapped_dist = nucleo_dist_from_mer(
        L_mer,
        options.pseudo_count_for_L_mer_scoring,
        gap_index=gap_index
    )
    gapped_pssm = hmm.calculate_log_scores(gapped_dist)
    gapped_comp_pssm = hmm.calculate_complementary_scores(gapped_pssm)

    # Row order: ungapped, ungapped complement, gapped, gapped complement.
    score_rows = [pssm_scores, comp_pssm_scores]
    for candidate_pssm in (gapped_pssm, gapped_comp_pssm):
        score_rows.append(
            hmm.max_scores_in_sequences(candidate_pssm, sequences, bg_L_mer_scores)
        )

    # Element-wise best over the four PSSM variants.
    best = numpy.array(score_rows).max(axis=0)
    # A negative best score means no site was found, so ignore it.
    no_site = best < 0.
    best[no_site] = 0.

    total = best.sum()
    score = total / len(L_mer) / len(best)
    logging.debug('Evaluated: %s; gap: %d; score: %f', numpy_to_seq(L_mer), gap_index, score)
    return score
def generate_seeds(
  sequences,
  preprocessed_sequences,
  options
):
    """
    Generate a list of candidate L-mers and score them to find the best seed L-mer and gap position.

    The function works in three stages:

      1. Obtain a background model: load it from C{options.bg_model_filename}
         if that file exists, otherwise learn one (and save it when a
         filename was given).
      2. Obtain candidate L-mers: either the single forced seed
         (C{options.force_seed}) or the top L-mers reported by
         C{hmm.top_mers_by_sequence_membership}.
      3. Attach a gap position to every candidate: either the forced one
         (C{options.force_gap}, with a score of 0.0) or the best one found
         by C{evaluate_L_mer} after filtering out candidates that are too
         close to previously accepted ones.

    @param sequences: The sequences to learn the background model from and
        to score L-mers against.
    @param preprocessed_sequences: Passed to
        C{hmm.top_mers_by_sequence_membership} to find candidate L-mers.
    @param options: Object supplying C{bg_model_filename},
        C{bg_model_num_mosaics}, C{bg_model_order}, C{force_seed},
        C{force_gap}, C{L}, C{allowed_shifts}, C{shift_cost},
        C{max_L_mers_to_evaluate} and C{seed_filter_distance}. It is also
        passed on to C{evaluate_L_mer}.
    @return: A tuple C{(L_mer_seeds, bg_model)}. C{L_mer_seeds} is a list of
        C{(sequence, count, # sequences, gap index, score)} tuples; when
        gaps were evaluated it is sorted by score, highest first, and may
        be empty if every candidate was discarded.
    """
    #
    # Stage 1: background model
    #
    # if we have been given a background model filename and it exists then load it.
    if options.bg_model_filename is not None and os.path.exists(options.bg_model_filename):
        logging.info("Loading supplied background model from %s", options.bg_model_filename)
        # NOTE(review): unpickling executes code from the file; only load
        # background models from trusted locations.
        # The file modes are left as text mode on purpose so that models
        # saved by earlier runs (see the dump below) stay readable.
        with open(options.bg_model_filename) as bg_model_file:
            bg_model = cPickle.load(bg_model_file)
        converted_seqs = [bg_model.converter.to_order_n(s) for s in sequences]
    else:
        logging.info("Learning new background model")
        bg_model, converted_seqs = learn_bg_model(
          sequences,
          num_mosaics=options.bg_model_num_mosaics,
          order=options.bg_model_order
        )
        if options.bg_model_filename:
            logging.info("Saving background model to %s", options.bg_model_filename)
            # The with-block makes sure the pickle is flushed and closed.
            with open(options.bg_model_filename, 'w') as bg_model_file:
                cPickle.dump(bg_model, bg_model_file)

    #
    # Stage 2: candidate L-mers as (L-mer, count, # sequences) tuples
    #
    if options.force_seed:
        logging.info('Forcing seed to be: %s', options.force_seed)
        L_mers = [(seq_to_numpy(options.force_seed), len(sequences), len(sequences))]
    else:
        # Find best candidate L-mers
        start = time.time()
        # Look for more candidates than will be evaluated because some of
        # them are discarded by the distance filter below.
        num_L_mers_to_find = 3 * options.max_L_mers_to_evaluate
        logging.info('Finding best %d candidate %d-mers to seed HMM emissions', num_L_mers_to_find, options.L)
        L_mers = hmm.top_mers_by_sequence_membership(
          preprocessed_sequences,
          k=options.L,
          n=num_L_mers_to_find
        )
        logging.info('Finding top %d %d-mers took %f seconds', len(L_mers), options.L, time.time()-start)

    #
    # Stage 3: gap positions
    #
    if options.force_gap:
        logging.info('Forcing gap at position: %d', options.force_gap)
        L_mer_seeds = [
          (numpy_to_seq(L_mer), L_mer_count, L_mer_num_seqs, options.force_gap, 0.0)
          for L_mer, L_mer_count, L_mer_num_seqs in L_mers
        ]
    else:
        # Everything the evaluation needs is set up here, not in stage 2,
        # so that it is also available when the seed was forced.
        # NOTE(review): with a forced seed this assumes options.L matches
        # the length of options.force_seed - confirm against callers.

        # Calculate log likelihood of L-mers under background model.
        bg_L_mer_scores = calculate_k_mer_scores(bg_model, converted_seqs, options.L)
        distance = K_mer_distance(allowed_shifts=options.allowed_shifts, shift_cost=options.shift_cost)
        # Floor division keeps these integral whatever the division mode.
        gap_end_offset = options.L // 5 + 1

        # Evaluate L-mers
        if -1 == options.seed_filter_distance:
            # -1 selects a default minimum distance derived from L.
            min_distance = options.L // 4 + 1
        else:
            min_distance = options.seed_filter_distance
        logging.info('Positioning gaps up to %d bases from end of K-mers', gap_end_offset)
        gap_positions = range(gap_end_offset, options.L+1-gap_end_offset)
        logging.info('Filtering K-mers that are not %d away from previously evaluated.', min_distance)
        logging.info('Evaluating up to %d L-mers.', options.max_L_mers_to_evaluate)
        L_mer_filter = DistanceFilter(distance, min_distance=min_distance)
        L_mer_seeds = []
        discarded = 0
        evaluated = 0
        for L_mer, L_mer_count, L_mer_num_seqs in L_mers:
            L_mer_seq = numpy_to_seq(L_mer)
            # The filter is consulted before the check for the value 4, as
            # in the original ordering of this condition.
            # NOTE(review): 4 presumably encodes an unknown base - confirm.
            if not L_mer_filter(L_mer) or 4 in L_mer:
                logging.debug('Discarding: %s; count: %d; # sequences: %d', L_mer_seq, L_mer_count, L_mer_num_seqs)
                discarded += 1
            else:
                score, gap_index = evaluate_L_mer(L_mer, sequences, bg_L_mer_scores, gap_positions, options)
                evaluated += 1
                logging.info(
                    'Evaluated (%3d/%d): %s; gap: %d; count: %d; # sequences: %d; score: %f',
                    evaluated, options.max_L_mers_to_evaluate, L_mer_seq, gap_index, L_mer_count, L_mer_num_seqs, score
                )
                L_mer_seeds.append((L_mer_seq, L_mer_count, L_mer_num_seqs, gap_index, score))
                if len(L_mer_seeds) == options.max_L_mers_to_evaluate:
                    break
        L_mer_seeds.sort(key=lambda x: -x[4]) # sort by score, highest first
        logging.info('Discarded %d L-mers using edit distance', discarded)
        if L_mer_seeds:
            logging.info('Evaluated %d L-mers: scores range from %f to %f', evaluated, L_mer_seeds[-1][4], L_mer_seeds[0][4])
        else:
            # Every candidate was discarded (or there were none), so there
            # is no score range to report.
            logging.warning('Evaluated %d L-mers: no seeds were found', evaluated)
    return L_mer_seeds, bg_model