Esempio n. 1
0
def find_best_w_mers_for_seed(stem_algorithm, seed, options, log_level):
    """Find the best W-mers for the seed.

    A model of width ``len(seed)`` is built from ``stem_algorithm.data``,
    seeded with ``seed``, and then used to search the data for up to
    ``options.max_num_sites`` best W-mers.  Every W-mer found is logged
    together with its Z value and its Hamming distance from the seed.  The
    model is then updated from the W-mers found (without pseudo-counts) and
    its log product of p-values is read back.

    Args:
        stem_algorithm: Object whose ``data`` attribute holds the sequence
            data to search.
        seed: The seed sequence; its length is used as the W-mer width.
        options: Options object; ``max_num_sites`` gives the number of
            W-mers to look for.  It is also passed on to ``stempy.Model``.
        log_level: ``logging`` level used for the per-seed report messages.

    Returns:
        A tuple ``(avg_Z, log_pop)``: the mean Z over the W-mers found
        (``0.`` when none were found) and the model's log product of
        p-values after the update.
    """
    num_to_find = options.max_num_sites
    logging.log(
        log_level, 'Finding best %d W-mers for seed %s', num_to_find, seed)
    _freqs, _freqs_with_pseudo_counts, model = stempy.Model(
        stem_algorithm.data,
        len(seed),
        options
    )
    logging.debug('Seed pseudo-count: %f', model.bs.seed_pseudo_counts)
    logging.debug('Model lambda: %f', model.lambda_)
    # NOTE(review): the meaning of the second argument (True) is not visible
    # from this function - confirm against the binding-site model API.
    model.bs.seed(seed, True)
    best_w_mer_finder = stempy.create_best_w_mer_finder(
        stem_algorithm.data, model, num_to_find)
    best_w_mer_finder()
    # Keep at most the last num_to_find evaluations produced by the finder.
    best_w_mers = list(best_w_mer_finder.best_w_mers)[-num_to_find:]
    avg_Z = 0.
    avg_distance = 0.
    for eval_ in best_w_mers:
        w_mer = stem_algorithm.data.get_W_mer(len(seed), eval_.global_pos)
        if eval_.rev_comp:
            # Report the W-mer on the strand it was evaluated on.
            w_mer = reverse_complement(w_mer)
        distance = hamming_distance(seed, w_mer)
        logging.log(log_level, 'Best W-mer: %s; Z=%.5f, distance=%d',
                    w_mer, eval_.Z, distance)
        avg_Z += eval_.Z
        avg_distance += distance
    best_w_mer_finder.update_model(num_to_find, use_pseudo_counts=False)
    log_pop = model.log_product_of_pvalues
    num_found = len(best_w_mers)
    if num_found:
        avg_Z /= num_found
        avg_distance /= num_found
    else:
        # Nothing was found: leave both averages at 0. instead of dividing
        # by zero.
        logging.warning('No W-mers found for seed %s', seed)
    logging.log(log_level, 'Seed: %s; log PoP: %.6f', seed, log_pop)
    logging.log(log_level, 'Seed: %s; Average Z=%.7f', seed, avg_Z)
    logging.log(log_level, 'Seed: %s; Average distance=%.2f',
                seed, avg_distance)
    # range() rather than xrange() so the function runs on Python 2 and 3.
    logging.debug(
        'Frequencies: %s',
        ' '.join(str(model.bg.freqs.freq(x)) for x in range(4)))
    return avg_Z, log_pop
Esempio n. 2
0
# Sort the instances that were found before reporting them.
instance_finder.instances.sort()
found_instances = instance_finder.instances

# The 13 instances logged by a reference run:
#2012-06-16 11:32:58,686 - INFO - seq=    5; pos=    67; strand=+; W-mer=AACCTCGAGAG; Z=0.749857
#2012-06-16 11:32:58,686 - INFO - seq=    0; pos=    48; strand=+; W-mer=AACCTAAGAAA; Z=0.814953
#2012-06-16 11:32:58,686 - INFO - seq=    3; pos=    51; strand=+; W-mer=AAACTGTGGCT; Z=0.819370
#2012-06-16 11:32:58,686 - INFO - seq=    5; pos=    79; strand=+; W-mer=AAGCTAAAGAG; Z=0.827948
#2012-06-16 11:32:58,687 - INFO - seq=    3; pos=    36; strand=-; W-mer=AAGCTTATCAG; Z=0.862206
#2012-06-16 11:32:58,687 - INFO - seq=    5; pos=    97; strand=-; W-mer=GAACTGGGGAT; Z=0.912242
#2012-06-16 11:32:58,687 - INFO - seq=    2; pos=    47; strand=+; W-mer=AAACTTGGGAA; Z=0.919355
#2012-06-16 11:32:58,687 - INFO - seq=    1; pos=     6; strand=+; W-mer=AACCTTAGACG; Z=0.963969
#2012-06-16 11:32:58,687 - INFO - seq=    6; pos=    46; strand=-; W-mer=AAGCTGGGGAC; Z=0.968255
#2012-06-16 11:32:58,687 - INFO - seq=    9; pos=    73; strand=-; W-mer=GACCTGATGAG; Z=0.968813
#2012-06-16 11:32:58,687 - INFO - seq=    5; pos=    16; strand=-; W-mer=AACCTGAGCCG; Z=0.974733
#2012-06-16 11:32:58,687 - INFO - seq=    6; pos=    73; strand=+; W-mer=AACCTTAGGCG; Z=0.984093
#2012-06-16 11:32:58,687 - INFO - seq=    3; pos=    10; strand=+; W-mer=AACCTTAGGAT; Z=0.984245


#
# Log one line per instance: sequence, position, strand, W-mer and Z
#
for found in found_instances:
    seq_idx, offset = data.pos_localise(found.global_pos)
    site = data.get_W_mer(W, found.global_pos)
    if found.rev_comp:
        # Show the W-mer as read on the reverse strand.
        site = stempy.reverse_complement(site)
        strand = '-'
    else:
        strand = '+'
    logging.info(
        'seq=%5d; pos=%6d; strand=%s; W-mer=%s; Z=%4f',
        seq_idx, offset, strand, site, found.Z)

# The run is expected to find exactly 13 instances.
num_found = len(found_instances)
logging.info('Found %d instances', num_found)
assert 13 == num_found
Esempio n. 3
0
#
# Configure a run that looks for two motifs of width 8 to 10
#
fasta_file = os.path.join(fasta_dir(), "random-seqs-two-motifs.fasta")
options = stempy.get_default_options()
options.output_dir = os.path.join("output", "test-2-motifs")
options.min_w = 8
options.max_w = 10
options.num_motifs = 2
options.meme_like_output = "two-motif-test-meme.txt"
# Path of the MEME-like output file inside the output directory.
meme_output = os.path.join(options.output_dir, options.meme_like_output)


#
# Run the STEME algorithm on the FASTA file
#
algorithm = stempy.Algorithm(options)
algorithm(fasta_file)

#
# Check that output containing 2 motifs can be parsed
#
predicted_sites = parse_meme_output_for_sites(meme_output)

#
# Derive a consensus sequence from the PSSM of every motif found
#
consensuses = []
for motif in algorithm.motifs:
    log_probs = motif.model.bs.pssm.log_probs.values()
    consensuses.append(stempy.consensus_from_pssm(log_probs))

#
# Each consensus must match the expected motif on one strand or the other
#
for motif_idx, expected in enumerate(("AAACTCACTC", "AACCTGTG")):
    consensus = consensuses[motif_idx]
    assert (
        consensus == expected
        or stempy.reverse_complement(consensus) == expected
    ), consensus