def do_test(self, seed, fasta_file):
    """
    Do one test case.

    Seeds a model with ``seed``, runs the best W-mer finder over the
    sequences loaded from ``fasta_file`` and checks that the seed itself is
    amongst the best W-mers found and that the finder reports no overlaps.

    :param seed: Seed sequence. Its length is used as the motif width W.
    :param fasta_file: FASTA file passed to ``stempy.Algorithm._initialise``.
    :raises RuntimeError: If no W-mers were found, or if none of the best
        W-mers is equal to the seed.
    :raises AssertionError: If the finder reports overlapping W-mers.
    """
    #
    # Load sequences and build index
    #
    algorithm = stempy.Algorithm(self.options)
    algorithm._initialise(fasta_file)
    data = algorithm.input_sequences.data

    #
    # Look for best W-mers. W is derived from the seed, so the seed is
    # always the same length as the motif and needs no separate check.
    #
    W = len(seed)
    num_to_find = 3
    logging.info('Looking for %d best W-mers', num_to_find)
    best_w_mer_finder = stempy.create_best_w_mer_finder(
        data,
        algorithm.create_model_of_input(W),
        num_to_find
    )
    logging.info('Seeding model with %s', seed)
    best_w_mer_finder.model.bs.seed(seed, True)
    best_w_mer_finder.model.set_lambda_for_sites(best_w_mer_finder.data.num_sequences)
    best_w_mer_finder()
    if not best_w_mer_finder.best_w_mers:
        raise RuntimeError('Did not find any W-mers')

    #
    # Log best W-mers
    #
    for _eval in best_w_mer_finder.best_w_mers:
        seq, offset = data.pos_localise(_eval.global_pos)
        strand = '-' if _eval.rev_comp else '+'
        logging.info(
            'Seed: %s; Site: %s; seq: % 2d; offset: % 4d; strand: %s; p(binding): %.2e; p(not binding): %.2e',
            seed, data.get_W_mer(W, _eval.global_pos), seq, offset, strand, _eval.Z, 1.-_eval.Z
        )

    #
    # Check we at least found the seed. The comparison is against the W-mer
    # as returned by data.get_W_mer for the site's global position.
    #
    seed_found = any(
        data.get_W_mer(W, _eval.global_pos) == seed
        for _eval in best_w_mer_finder.best_w_mers
    )
    if not seed_found:
        raise RuntimeError('Could not find seed in best W-mers')

    #
    # Check we have no overlaps, both with the external helper (presumably
    # it raises on overlap -- defined elsewhere) and with the finder's own flag.
    #
    localised_positions = [
        data.pos_localise(_eval.global_pos)
        for _eval in best_w_mer_finder.best_w_mers
    ]
    check_w_mers_dont_overlap(W, localised_positions)
    assert not best_w_mer_finder.has_overlapping
def test_find_best_W_mers_2(self):
    """
    Run the best W-mer finder for a couple of seeds and check that each
    seed is recovered amongst the best W-mers found for it.

    :raises RuntimeError: If a seed is not amongst its best W-mers.
    """
    # Ask for exactly two sites: the minimum is set first, then the maximum.
    num_to_find = 2
    self.options.min_num_sites = num_to_find
    self.options.max_num_sites = num_to_find

    # Locate the data, load the sequences and build the index.
    fasta_file = os.path.normpath(get_fasta_file('T00759-small.fa'))
    algorithm = stempy.Algorithm(self.options)
    algorithm._initialise(fasta_file)
    data = algorithm.input_sequences.data

    seeds = (
        'ATGCAGAAAAATTAAG',
        'TTTAAAATACTTTAAA',
    )
    for seed in seeds:
        # Build a fresh model of the right width and seed it.
        W = len(seed)
        model = algorithm.create_model_of_input(W)
        model.bs.seed(seed, True)
        model.set_lambda_for_sites(data.num_sequences)

        # Run the finder under that model.
        best_w_mer_finder = stempy.create_best_w_mer_finder(data, model, num_to_find)
        best_w_mer_finder()
        found = best_w_mer_finder.best_w_mers

        # Report every site and accumulate Z for the average.
        total_Z = 0.
        for site in found:
            logging.info(
                'Seed: %s; Site: %s; p(binding): %.2e; p(not binding): %.2e',
                seed, data.get_W_mer(W, site.global_pos), site.Z, 1.-site.Z
            )
            total_Z += site.Z
        logging.info('Seed: %s; Average Z: %.6f', seed, total_Z / len(found))

        # The seed itself must be one of the sites.
        if not any(data.get_W_mer(W, site.global_pos) == seed for site in found):
            raise RuntimeError('Could not find seed in best W-mers')

        # Update the model from the sites and log the product of p-values.
        best_w_mer_finder.update_model(num_to_find, use_pseudo_counts=False)
        logging.info(
            'Seed: %s; log PoP: %.6f',
            seed, algorithm.significance.log_product_p_values(model)
        )
def test_specific_case(self):
    """
    Check that a known seed on the tiny data set yields the expected W-mers.

    :raises ValueError: If too few W-mers are found, or if a site differs
        from the expected one and also has a lower Z than the expected site.
    """
    seed = 'AAAACCCA'
    W = len(seed)
    num_sites = 4
    fasta_file = os.path.normpath(get_fasta_file('T00759-tiny.fa'))
    self.options.max_num_sites = num_sites
    self.options.min_num_sites = num_sites

    # Load the sequences and build the index.
    algorithm = stempy.Algorithm(self.options)
    algorithm._initialise(fasta_file)
    data = algorithm.input_sequences.data

    # Create the seeded model and search for the best W-mers under it.
    model = algorithm.create_model_of_input(W)
    model.bs.seed(seed, True)
    model.set_lambda_for_sites(data.num_sequences)
    best_w_mer_finder = stempy.create_best_w_mer_finder(data, model, num_sites)
    best_w_mer_finder()

    # Fewer sites than requested is only acceptable when the finder returned
    # as many W-mers as the model reports in num_W_mers.
    if (len(best_w_mer_finder.best_w_mers) < num_sites
            and len(best_w_mer_finder.best_w_mers) != model.num_W_mers):
        raise ValueError('Did not find enough W-mers')

    # We want to get these W-mers
    #
    # 2011-08-09 10:11:32,846 - INFO - Z=8.00e-02; pos=   313 +; AAAACCCA; AAAACCCA
    # 2011-08-09 10:11:32,846 - INFO - Z=4.37e-02; pos=   668 -; TGAGTTTT; AAAACTCA
    # 2011-08-09 10:11:32,846 - INFO - Z=1.37e-02; pos=   710 -; TGGTTTTC; GAAAACCA
    # 2011-08-09 10:11:32,846 - INFO - Z=1.37e-02; pos=   681 -; TGGTTCTT; AAGAACCA
    #
    expected_sites = [(313, False), (668, True), (710, True), (681, True)]
    for wmer, (global_pos, rev_comp) in zip(best_w_mer_finder.best_w_mers, expected_sites):
        if wmer.global_pos == global_pos:
            continue
        # A different position is tolerated unless its Z is lower than the
        # Z the model calculates for the expected site.
        if wmer.Z < model.calculate_Z(global_pos, rev_comp):
            raise ValueError('Got wrong W-mers')
def find_best_w_mers_for_seed(stem_algorithm, seed, options, log_level):
    """
    Find the best W-mers for the seed.

    Builds a model of the seed's width, seeds it, runs the best W-mer finder
    over ``stem_algorithm.data`` and logs each W-mer with its Z and its
    Hamming distance from the seed. The model is then updated from the
    W-mers (without pseudo-counts) before its log product of p-values is read.

    :param stem_algorithm: Object whose ``data`` attribute holds the
        sequences to search.
    :param seed: Seed sequence. Its length is used as the W-mer width.
    :param options: Options object; ``max_num_sites`` is the number of
        W-mers to look for.
    :param log_level: ``logging`` level used for the per-seed messages.
    :returns: Tuple ``(avg_Z, log_pop)``: the mean Z over the W-mers kept and
        the model's ``log_product_of_pvalues`` after the update.
    :raises ZeroDivisionError: If the finder yields no W-mers, since the
        averages divide by the number of W-mers kept.
    """
    num_to_find = options.max_num_sites
    logging.log(
        log_level, 'Finding best %d W-mers for seed %s', num_to_find, seed)

    # NOTE(review): the result is unpacked as a 3-tuple and only the model
    # is used -- confirm against stempy that this call returns a triple.
    _freqs, _freqs_with_pseudo_counts, model = stempy.Model(
        stem_algorithm.data, len(seed), options
    )
    logging.debug('Seed pseudo-count: %f', model.bs.seed_pseudo_counts)
    logging.debug('Model lambda: %f', model.lambda_)
    model.bs.seed(seed, True)

    best_w_mer_finder = stempy.create_best_w_mer_finder(
        stem_algorithm.data, model, num_to_find)
    best_w_mer_finder()
    # Keep at most the last num_to_find entries of the finder's results.
    best_w_mers = list(best_w_mer_finder.best_w_mers)[-num_to_find:]

    avg_Z = 0.
    avg_distance = 0.
    for eval_ in best_w_mers:
        w_mer = stem_algorithm.data.get_W_mer(len(seed), eval_.global_pos)
        # Sites on the reverse strand are compared to the seed in the
        # seed's own orientation.
        if eval_.rev_comp:
            w_mer = reverse_complement(w_mer)
        distance = hamming_distance(seed, w_mer)
        logging.log(log_level, 'Best W-mer: %s; Z=%.5f, distance=%d', w_mer, eval_.Z, distance)
        avg_Z += eval_.Z
        avg_distance += distance

    best_w_mer_finder.update_model(num_to_find, use_pseudo_counts=False)
    log_pop = model.log_product_of_pvalues
    avg_Z /= len(best_w_mers)
    logging.log(log_level, 'Seed: %s; log PoP: %.6f', seed, log_pop)
    logging.log(log_level, 'Seed: %s; Average Z=%.7f', seed, avg_Z)
    logging.log(log_level, 'Seed: %s; Average distance=%.2f', seed, avg_distance / len(best_w_mers))

    # range() rather than xrange() so this runs on both Python 2 and 3;
    # the message is handed to logging as an argument instead of being
    # %-formatted up front.
    frequencies = ' '.join(str(model.bg.freqs.freq(x)) for x in range(4))
    logging.debug('Frequencies: %s', frequencies)
    return avg_Z, log_pop
def test_we_get_all_W_mers_we_asked_for(self):
    """
    Check that the finder never returns fewer W-mers than requested, unless
    the number it returns equals the model's ``num_W_mers``.

    :raises ValueError: If the finder comes up short for any seed and
        requested count.
    """
    site_counts = [2, 4, 8, 16, 32]
    fasta_file = os.path.normpath(get_fasta_file('T00759-small.fa'))
    self.options.max_num_sites = max(site_counts)
    self.options.min_num_sites = min(site_counts)

    # Load the sequences and build the index.
    algorithm = stempy.Algorithm(self.options)
    algorithm._initialise(fasta_file)
    data = algorithm.input_sequences.data

    seeds = (
        'GCTAGCTAGCGG',
        'ATGCAGAAAAATTAAG',
        'TTTAAAATACTTTAAA',
    )
    for seed in seeds:
        # One seeded model per seed, reused for every requested count.
        logging.info('Using seed %s', seed)
        W = len(seed)
        model = algorithm.create_model_of_input(W)
        model.bs.seed(seed, True)
        model.set_lambda_for_sites(data.num_sequences)

        for num_to_find in site_counts:
            # Search for the best W-mers under the model.
            best_w_mer_finder = stempy.create_best_w_mer_finder(data, model, num_to_find)
            best_w_mer_finder()

            # Got everything we asked for: nothing more to check.
            if len(best_w_mer_finder.best_w_mers) >= num_to_find:
                continue
            # A shortfall is acceptable when the count matches num_W_mers.
            if len(best_w_mer_finder.best_w_mers) == model.num_W_mers:
                continue

            logging.warning('Found %d W-mers', len(best_w_mer_finder.best_w_mers))
            logging.warning('%d W-mers available', model.num_W_mers)
            logging.warning('Wanted %d W-mers', num_to_find)
            raise ValueError('Did not find enough W-mers')
bs_model = stempy.PssmBindingSiteModel(stempy.initialise_uniform_pssm(W, options.alphabet_size)) bs_model.seed(seed) # whole model model = stempy.Model(data, bs_model, bg_model, _lambda=0.) Z_threshold = .3 with Timer(msg='find instances with Z>%f' % Z_threshold): instance_finder = stempy.FindInstances(data, model, Z_threshold) instance_finder() logging.info('Found %d instances', len(instance_finder.instances)) num_W_mers_to_find = 10000 with Timer(msg='find %d best W-mers' % num_W_mers_to_find): w_mer_finder = stempy.create_best_w_mer_finder(data, model, num_W_mers_to_find) w_mer_finder() logging.info('Found %d instances', len(w_mer_finder.best_w_mers)) def global_overlap(pos1, pos2, W): return abs(pos1 - pos2) < W def get_non_overlapping(instances, W): instances.sort() instances.reverse() result = [] for i in instances: for better in result: if global_overlap(i.global_pos, better.global_pos, W): break