def find_best_w_mers_for_seed(stem_algorithm, seed, options, log_level):
    """Find the best W-mers for the seed.

    Creates a model with ``stempy.Model(stem_algorithm.data, len(seed),
    options)``, seeds it with ``seed``, runs a best W-mer finder over
    ``stem_algorithm.data`` and logs every W-mer found together with its Z
    and its Hamming distance from the seed. Afterwards the model is updated
    from the finder with ``use_pseudo_counts=False``.

    Args:
        stem_algorithm: Object whose ``data`` attribute is searched.
        seed: The seed; its length is used as the W-mer width.
        options: Options object; ``max_num_sites`` is the number of W-mers
            to look for.
        log_level: ``logging`` level used for the per-seed messages.

    Returns:
        A tuple ``(avg_Z, log_pop)``: the mean Z of the W-mers found (0.0
        when none were found) and the model's ``log_product_of_pvalues``
        read after the model update.
    """
    num_to_find = options.max_num_sites
    width = len(seed)
    logging.log(
        log_level, 'Finding best %d W-mers for seed %s', num_to_find, seed)

    _freqs, _freqs_with_pseudo_counts, model = stempy.Model(
        stem_algorithm.data, width, options
    )
    logging.debug('Seed pseudo-count: %f', model.bs.seed_pseudo_counts)
    logging.debug('Model lambda: %f', model.lambda_)
    # NOTE(review): the meaning of the second argument (True) is not visible
    # from here - confirm against the stempy binding.
    model.bs.seed(seed, True)

    best_w_mer_finder = stempy.create_best_w_mer_finder(
        stem_algorithm.data, model, num_to_find)
    best_w_mer_finder()
    # Keep only the last num_to_find entries of whatever the finder produced.
    best_w_mers = list(best_w_mer_finder.best_w_mers)[-num_to_find:]

    total_Z = 0.
    total_distance = 0.
    for eval_ in best_w_mers:
        w_mer = stem_algorithm.data.get_W_mer(width, eval_.global_pos)
        if eval_.rev_comp:
            w_mer = reverse_complement(w_mer)
        distance = hamming_distance(seed, w_mer)
        logging.log(log_level, 'Best W-mer: %s; Z=%.5f, distance=%d',
                    w_mer, eval_.Z, distance)
        total_Z += eval_.Z
        total_distance += distance

    best_w_mer_finder.update_model(num_to_find, use_pseudo_counts=False)
    log_pop = model.log_product_of_pvalues

    # Guard the averages: with no W-mers found the divisions below would
    # otherwise raise ZeroDivisionError.
    num_found = len(best_w_mers)
    if num_found:
        avg_Z = total_Z / num_found
        avg_distance = total_distance / num_found
    else:
        avg_Z = 0.
        avg_distance = 0.

    logging.log(log_level, 'Seed: %s; log PoP: %.6f', seed, log_pop)
    logging.log(log_level, 'Seed: %s; Average Z=%.7f', seed, avg_Z)
    logging.log(log_level, 'Seed: %s; Average distance=%.2f',
                seed, avg_distance)
    # range(4): presumably one frequency per base of the DNA alphabet -
    # confirm against the background model.
    logging.debug(
        'Frequencies: %s',
        ' '.join(str(model.bg.freqs.freq(x)) for x in range(4)))
    return avg_Z, log_pop
# Put the instances into their natural sort order before reporting them.
instance_finder.instances.sort()

# at least 13 instances in sequences; a previous run logged the following:
#2012-06-16 11:32:58,686 - INFO - seq= 5; pos= 67; strand=+; W-mer=AACCTCGAGAG; Z=0.749857
#2012-06-16 11:32:58,686 - INFO - seq= 0; pos= 48; strand=+; W-mer=AACCTAAGAAA; Z=0.814953
#2012-06-16 11:32:58,686 - INFO - seq= 3; pos= 51; strand=+; W-mer=AAACTGTGGCT; Z=0.819370
#2012-06-16 11:32:58,686 - INFO - seq= 5; pos= 79; strand=+; W-mer=AAGCTAAAGAG; Z=0.827948
#2012-06-16 11:32:58,687 - INFO - seq= 3; pos= 36; strand=-; W-mer=AAGCTTATCAG; Z=0.862206
#2012-06-16 11:32:58,687 - INFO - seq= 5; pos= 97; strand=-; W-mer=GAACTGGGGAT; Z=0.912242
#2012-06-16 11:32:58,687 - INFO - seq= 2; pos= 47; strand=+; W-mer=AAACTTGGGAA; Z=0.919355
#2012-06-16 11:32:58,687 - INFO - seq= 1; pos= 6; strand=+; W-mer=AACCTTAGACG; Z=0.963969
#2012-06-16 11:32:58,687 - INFO - seq= 6; pos= 46; strand=-; W-mer=AAGCTGGGGAC; Z=0.968255
#2012-06-16 11:32:58,687 - INFO - seq= 9; pos= 73; strand=-; W-mer=GACCTGATGAG; Z=0.968813
#2012-06-16 11:32:58,687 - INFO - seq= 5; pos= 16; strand=-; W-mer=AACCTGAGCCG; Z=0.974733
#2012-06-16 11:32:58,687 - INFO - seq= 6; pos= 73; strand=+; W-mer=AACCTTAGGCG; Z=0.984093
#2012-06-16 11:32:58,687 - INFO - seq= 3; pos= 10; strand=+; W-mer=AACCTTAGGAT; Z=0.984245

#
# Log every instance: sequence index, position, strand, W-mer and Z
#
for instance in instance_finder.instances:
    seq, pos = data.pos_localise(instance.global_pos)
    W_mer = data.get_W_mer(W, instance.global_pos)
    # Reverse-complemented instances are shown on the '-' strand.
    if instance.rev_comp:
        strand = '-'
        W_mer = stempy.reverse_complement(W_mer)
    else:
        strand = '+'
    logging.info(
        'seq=%5d; pos=%6d; strand=%s; W-mer=%s; Z=%4f',
        seq, pos, strand, W_mer, instance.Z)

# The check below requires exactly 13 instances.
num_found = len(instance_finder.instances)
logging.info('Found %d instances', num_found)
assert 13 == num_found
#
# Configure the run: input sequences, output location, motif widths/count
#
fasta_file = os.path.join(fasta_dir(), "random-seqs-two-motifs.fasta")
options = stempy.get_default_options()
options.output_dir = os.path.join("output", "test-2-motifs")
options.min_w = 8
options.max_w = 10
options.num_motifs = 2
options.meme_like_output = "two-motif-test-meme.txt"
# Path of the MEME-like output file, built from the values held by options.
meme_output = os.path.join(options.output_dir, options.meme_like_output)

#
# Execute the STEME algorithm on the FASTA file
#
algorithm = stempy.Algorithm(options)
algorithm(fasta_file)

#
# Parsing must succeed for output that contains 2 motifs
#
predicted_sites = parse_meme_output_for_sites(meme_output)

#
# Derive one consensus per motif and compare against the expected ones,
# accepting either strand
#
consensuses = [
    stempy.consensus_from_pssm(motif.model.bs.pssm.log_probs.values())
    for motif in algorithm.motifs
]
expected_consensuses = ("AAACTCACTC", "AACCTGTG")
for motif_index, expected in enumerate(expected_consensuses):
    consensus = consensuses[motif_index]
    assert (
        consensus == expected
        or stempy.reverse_complement(consensus) == expected
    ), consensus