Exemple #1
0
def calculate_n_mer_significances(seqs, n, background=None):
    '''
    Count all n-mers in the sequences and assess the significance of each
    count w.r.t. the background model.

    The n-mers are counted with hmm.count_mers, collapsed with
    collapse_rev_comps and each (mer, count) pair is scored with the log of
    the binomial probability of observing exactly count occurrences out of
    the total number of counted n-mers, using the background log likelihood
    of the mer as the per-occurrence log probability.

    @arg seqs: The sequences handed to hmm.count_mers.
    @arg n: The length of the n-mers to count.
    @arg background: Object whose LL(mer) method gives the log likelihood of
        a mer. If None, a uniform distribution over the bases is assumed,
        i.e. a log likelihood of n * log(.25) for every mer.
    @return: A list of (mer, count, log probability) tuples sorted by
        ascending log probability.
    '''
    from sys import getrecursionlimit

    n_mer_counter = _AllNMers()
    hmm.count_mers(seqs, n, n_mer_counter)

    # Materialise the collapsed counts: they are iterated twice below (once
    # for the total, once for the scores).
    collapsed = list(collapse_rev_comps(n_mer_counter.n_mers))
    total_counts = sum(count for _mer, count in collapsed)

    # Touch the log-factorial table in steps well below the recursion limit
    # before asking for the largest value.
    # NOTE(review): presumably _LogFactorial is a recursive memoised lookup
    # and this avoids exceeding the recursion depth - confirm.
    log_fact = _LogFactorial()
    for i in range(1, total_counts, getrecursionlimit() // 2):
        log_fact[i]
    log_fact_total = log_fact[total_counts]

    uniform_background = background is None
    if uniform_background:
        # The same log likelihood applies to every mer, so compute it once.
        background_LL = n * math.log(.25)
        foreground_LL = math.log(1.0 - math.exp(background_LL))

    result = []
    for mer, count in collapsed:
        if not uniform_background:
            background_LL = background.LL(mer)
            foreground_LL = math.log(1.0 - math.exp(background_LL))
        # log of: C(total, count) * p^count * (1-p)^(total-count)
        log_bernoulli = (
                log_fact_total
                - log_fact[count]
                - log_fact[total_counts - count]
                + count * background_LL
                + (total_counts - count) * foreground_LL
        )
        result.append((mer, count, log_bernoulli))

    # key= gives the same stable ascending order as the old cmp= comparator
    # and is also accepted by Python 3.
    result.sort(key=lambda entry: entry[2])
    return result
Exemple #2
0
    def __call__(self, sequences):
        """
        Run the motif finding algorithm.

        Counts all K-mers of length self.init_K_mer_length in the sequences
        (collapsed with their reverse complements), picks the candidate with
        the highest score produced by self.yield_evaluations, builds a model
        from it with self.model_for_initialisation_K_mer and trains that
        model with Baum-Welch on the preprocessed sequences.

        @arg sequences: The sequences to search; each must support len().
        @return: The model after Baum-Welch training.
        """

        preprocessed_sequences = hmm.preprocess_sequences(sequences)

        # how big are the sequences
        num_bases = sum(len(s) for s in sequences)

        # find all K-mers collapsed with their reverse complements
        logging.info("Finding all %d-mers in sequences", self.init_K_mer_length)
        start = time.time()
        nmer_counts = hmm.ReverseComplementCollapsingCounter(self.init_K_mer_length)
        hmm.count_mers(sequences, n=self.init_K_mer_length, callback=nmer_counts)
        logging.info("Took %f seconds to find %d-mers", time.time() - start, self.init_K_mer_length)

        # Force true division: with an integer expected_sites_per_sequence the
        # quotient would otherwise be truncated (typically to 0) on Python 2.
        p_binding_site = (self.expected_sites_per_sequence * len(sequences)) / float(num_bases)
        logging.info("Found %d %d-mers", nmer_counts.num_counts(), self.init_K_mer_length)

        # evaluate every candidate and keep the one with the highest score
        start = time.time()
        best_starting_point = max(self.yield_evaluations(nmer_counts, preprocessed_sequences), key=lambda x: x[1])
        logging.info("Evaluation took %f seconds", time.time() - start)
        # pass the values as logging arguments, like the other messages here
        logging.info("Best starting point: %s: %f", best_starting_point[0], best_starting_point[1])

        model = self.model_for_initialisation_K_mer(best_starting_point[0], p_binding_site)
        logging.info("Running Baum-Welch")
        start = time.time()
        LL, num_iterations = model.baum_welch(preprocessed_sequences)
        logging.info("Baum-Welch took %f seconds", time.time() - start)
        logging.info("Achieved LL: %f in %d iterations", LL, num_iterations)

        return model
def yield_k_mers(sequences, K):
    """
    Count the K-mers in the sequences (collapsed with their reverse
    complements) and yield them lazily via a heap.

    @return: Yield the (K-mer, count) in order such that the mers with highest number of occurrences come first.
    """
    from hmm import ReverseComplementCollapsingCounter, count_mers
    from heapq import heapify, heappop
    import time

    # count every K-mer, merging each with its reverse complement
    logging.info('Finding all %d-mers in sequences', K)
    counting_began = time.time()
    counter = ReverseComplementCollapsingCounter(K)
    count_mers(sequences, n=K, callback=counter)
    logging.info('Took %f seconds to find %d-mers', time.time()-counting_began, K)

    # Build a min-heap on the negated count so the most frequent K-mer pops
    # first; the enumeration index breaks ties before the K-mer itself would
    # have to be compared.
    heap_began = time.time()
    heap = [
        (-occurrences, index, mer)
        for index, (mer, occurrences) in enumerate(counter.counts())
    ]
    heapify(heap)
    logging.info('Took %f seconds to heapify', time.time()-heap_began)

    while heap:
        negated_count, _index, mer = heappop(heap)
        yield mer, -negated_count
0
def top_k_n_mers(seqs, n, k):
    """
    Collect the n-mers of the sequences and return the k entries that rank
    highest on their second element.

    @arg seqs: The sequences handed to hmm.count_mers.
    @arg n: The length of the n-mers.
    @arg k: How many entries to return.
    @return: A list of at most k entries of the collector's n_mers, largest
        first (presumably (mer, count) pairs ranked by count - confirm
        against _AllNMers).
    """
    from heapq import nlargest

    collector = _AllNMers()
    hmm.count_mers(seqs, n, collector)
    return nlargest(k, collector.n_mers, key=lambda entry: entry[1])
Exemple #5
0
def most_common_n_mer(seqs, n):
    """
    Run hmm.count_mers over the sequences with a _MostCommonNMer callback.

    @arg seqs: The sequences handed to hmm.count_mers.
    @arg n: The length of the n-mers.
    @return: The (best_mer, best_count) pair recorded by the callback.
    """
    tracker = _MostCommonNMer()
    hmm.count_mers(seqs, n, tracker)
    winner = (tracker.best_mer, tracker.best_count)
    return winner
Exemple #6
0
            numpy.array([0,1,2])
    ]
    print most_common_n_mer(seqs, 3)
    print most_common_n_mer(seqs, 5)

    print
    c=hmm.MarkovOrderConverter(4,2)
    a=numpy.array([0,1,2])
    order_n_obs = c.convert_to_order_n_observation(a)
    a_copy = c.convert_from_order_n_observation(order_n_obs)
    print a
    print a_copy
    print a.all() == a_copy.all()

    print
    seqs = [
            numpy.array([3,2,1]),
            numpy.array([0,1,2]),
            numpy.array([0,1,2]),
            numpy.array([1,2,3]),
    ]
    n = 3
    all = _AllNMers()
    hmm.count_mers(seqs, n, all)
    for mer, count in collapse_rev_comps(all.n_mers):
        print mer, count

    print
    significant_mers = calculate_n_mer_significances(seqs, 3, None)
    print significant_mers[:10]
Exemple #7
0
samples = [sample_from(model, N, L) for model in models]
print "Binding bases per model:", [sum(sum(s[0] != 0) for s in sample) for sample in samples]
print "Sites per sample:", [
    sum(len([site for site in yield_sites_in_states(s[0], (0,))]) for s in sample) for sample in samples
]


print "Converting sequences"
sequence_sets = [[hmm.pssm.numpy_to_seq(s[1]) for s in sample] for sample in samples]
print "Writing sequences"
for i, sequences in enumerate(sequence_sets):
    f = open("synthetic-2/synthetic-sequences-%s.fa" % tag(i), "w")
    for j, s in enumerate(sequences):
        f.write("> sequence %d\n" % j)
        f.write(s)
        f.write("\n")
    f.close()


if False:
    for i, sample in enumerate(samples):
        print "10 largest counts in sequence set: %d" % i
        nmer_counts = hmm.ReverseComplementCollapsingCounter(K)
        hmm.count_mers([sequence[1].astype(int) for sequence in sample], n=K, callback=nmer_counts)
        import heapq

        print "\n".join(
            "%s : %d" % (hmm.pssm.numpy_to_seq(nmer), count)
            for nmer, count in heapq.nlargest(10, nmer_counts.counts(), key=lambda count: count[1])
        )