def do_test(self, seed, fasta_file):
        """
        Do one test case: find the best W-mers for ``seed`` in ``fasta_file``.

        A ``stempy.Algorithm`` is built from ``self.options`` and initialised
        with ``fasta_file``. A model of width ``len(seed)`` is seeded with
        ``seed`` and the best W-mer finder is asked for 3 W-mers. The results
        are logged and then checked: at least one must equal the seed, and
        none may overlap.

        :param seed: Seed sequence; its length is used as the W-mer width W.
        :param fasta_file: FASTA file handed to ``algorithm._initialise``.
        :raises RuntimeError: If no W-mers were found, or if none of them
            equals ``seed``.
        :raises AssertionError: If the finder reports overlapping W-mers.
        """

        #
        # Load sequences and build index
        #
        algorithm = stempy.Algorithm(self.options)
        algorithm._initialise(fasta_file)
        data = algorithm.input_sequences.data

        #
        # look for best W-mers
        #
        # W is defined as the seed length, so no separate length check is
        # needed before seeding the model.
        W = len(seed)
        num_to_find = 3
        logging.info('Looking for %d best W-mers', num_to_find)
        best_w_mer_finder = stempy.create_best_w_mer_finder(
            data,
            algorithm.create_model_of_input(W),
            num_to_find
        )
        logging.info('Seeding model with %s', seed)
        best_w_mer_finder.model.bs.seed(seed, True)
        best_w_mer_finder.model.set_lambda_for_sites(best_w_mer_finder.data.num_sequences)
        best_w_mer_finder()
        best_w_mers = best_w_mer_finder.best_w_mers
        if not best_w_mers:
            raise RuntimeError('Did not find any W-mers')

        #
        # Log best W-mers
        #
        for _eval in best_w_mers:
            seq, offset = data.pos_localise(_eval.global_pos)
            strand = '-' if _eval.rev_comp else '+'
            logging.info(
                'Seed: %s; Site: %s; seq: % 2d; offset: % 4d; strand: %s; p(binding): %.2e; p(not binding): %.2e',
                seed, data.get_W_mer(W, _eval.global_pos), seq, offset, strand, _eval.Z, 1.-_eval.Z
            )

        #
        # check we at least found the seed...
        #
        # The comparison uses the text returned by data.get_W_mer() and does
        # not consult _eval.rev_comp.
        if not any(data.get_W_mer(W, _eval.global_pos) == seed for _eval in best_w_mers):
            raise RuntimeError('Could not find seed in best W-mers')

        #
        # check we have no overlaps
        #
        localised_positions = [data.pos_localise(_eval.global_pos) for _eval in best_w_mers]
        check_w_mers_dont_overlap(W, localised_positions)
        assert not best_w_mer_finder.has_overlapping
    def test_find_best_W_mers_2(self):
        """
        Run best W-mer finder and check we found the seeds we wanted.

        Sets both ``min_num_sites`` and ``max_num_sites`` on ``self.options``
        to 2, loads ``T00759-small.fa`` and, for each hard-coded seed, seeds a
        fresh model, finds the 2 best W-mers, logs them together with their
        average Z, checks that one of them equals the seed, and finally logs
        the log product of p-values after updating the model.

        :raises RuntimeError: If no W-mers are found for a seed, or if none
            of the W-mers found equals the seed.
        """
        self.options.min_num_sites = self.options.max_num_sites = num_to_find = 2

        # load data and create STEME object
        fasta_file = os.path.normpath(get_fasta_file('T00759-small.fa'))

        #
        # Load sequences and build index
        #
        algorithm = stempy.Algorithm(self.options)
        algorithm._initialise(fasta_file)
        data = algorithm.input_sequences.data

        for seed in (
            'ATGCAGAAAAATTAAG',
            'TTTAAAATACTTTAAA',
        ):
            # create and seed a model
            W = len(seed)
            model = algorithm.create_model_of_input(W)
            model.bs.seed(seed, True)
            model.set_lambda_for_sites(data.num_sequences)

            # look for best W-mers under model
            best_w_mer_finder = stempy.create_best_w_mer_finder(data, model, num_to_find)
            best_w_mer_finder()
            best_w_mers = best_w_mer_finder.best_w_mers

            # Fail with an explicit message (the same one do_test uses)
            # instead of a ZeroDivisionError when averaging below.
            if not best_w_mers:
                raise RuntimeError('Did not find any W-mers')

            avg_Z = 0.
            for _eval in best_w_mers:
                logging.info(
                    'Seed: %s; Site: %s; p(binding): %.2e; p(not binding): %.2e',
                    seed, data.get_W_mer(W, _eval.global_pos), _eval.Z, 1.-_eval.Z
                )
                avg_Z += _eval.Z
            logging.info('Seed: %s; Average Z: %.6f', seed, avg_Z / len(best_w_mers))

            #
            # Check we found the seed
            #
            for _eval in best_w_mers:
                if data.get_W_mer(W, _eval.global_pos) == seed:
                    break
            else:
                raise RuntimeError('Could not find seed in best W-mers')

            #
            # Log the product of p-values
            #
            best_w_mer_finder.update_model(num_to_find, use_pseudo_counts=False)
            logging.info('Seed: %s; log PoP: %.6f', seed, algorithm.significance.log_product_p_values(model))
    def test_specific_case(self):
        """
        Check the finder returns the expected W-mers for one known input.

        Uses ``T00759-tiny.fa`` with the seed ``AAAACCCA`` and asks for 4
        sites (both ``min_num_sites`` and ``max_num_sites`` are set to 4).

        :raises ValueError: If fewer W-mers than requested come back while
            more are available according to ``model.num_W_mers``, or if a
            returned W-mer is at an unexpected position and scores lower than
            the expected one.
        """
        fasta_file = os.path.normpath(get_fasta_file('T00759-tiny.fa'))
        seed = 'AAAACCCA'
        W = len(seed)
        num_sites = 4
        self.options.max_num_sites = num_sites
        self.options.min_num_sites = num_sites

        # Load the sequences and build the index.
        algorithm = stempy.Algorithm(self.options)
        algorithm._initialise(fasta_file)
        data = algorithm.input_sequences.data

        # Build a model of the input and seed it.
        model = algorithm.create_model_of_input(W)
        model.bs.seed(seed, True)
        model.set_lambda_for_sites(data.num_sequences)

        # Search for the best W-mers under the seeded model.
        finder = stempy.create_best_w_mer_finder(data, model, num_sites)
        finder()

        # Coming up short is only acceptable when every available W-mer
        # was returned.
        if (
            len(finder.best_w_mers) < num_sites
            and len(finder.best_w_mers) != model.num_W_mers
        ):
            raise ValueError('Did not find enough W-mers')

        # Expected W-mers, taken from a reference run logged on 2011-08-09:
        #
        #   Z=8.00e-02; pos=      313 +; AAAACCCA; AAAACCCA
        #   Z=4.37e-02; pos=      668 -; TGAGTTTT; AAAACTCA
        #   Z=1.37e-02; pos=      710 -; TGGTTTTC; GAAAACCA
        #   Z=1.37e-02; pos=      681 -; TGGTTCTT; AAGAACCA
        #
        expected_sites = [(313, False), (668, True), (710, True), (681, True)]
        for found, (expected_pos, expected_rev_comp) in zip(finder.best_w_mers, expected_sites):
            if found.global_pos == expected_pos:
                continue
            # A different position is tolerated as long as it does not score
            # below the expected site.
            if found.Z < model.calculate_Z(expected_pos, expected_rev_comp):
                raise ValueError('Got wrong W-mers')
# Example #4
# 0
def find_best_w_mers_for_seed(stem_algorithm, seed, options, log_level):
    """
    Find the best W-mers for the seed.

    Builds a model of width ``len(seed)`` over ``stem_algorithm.data``, seeds
    it, runs the best W-mer finder for ``options.max_num_sites`` W-mers and
    logs each W-mer found with its Z and its Hamming distance from the seed.
    The model is then updated from the W-mers (without pseudo-counts) and its
    log product of p-values is read.

    :param stem_algorithm: Object whose ``data`` attribute holds the
        sequence data to search.
    :param seed: Seed sequence; its length is the W-mer width.
    :param options: Options object; ``max_num_sites`` gives the number of
        W-mers to find, and the object is also passed to ``stempy.Model``.
    :param log_level: Logging level used for the per-seed summary messages.
    :returns: Tuple ``(avg_Z, log_pop)``: the mean Z of the W-mers found and
        the model's ``log_product_of_pvalues``.
    :raises ZeroDivisionError: If the finder returns no W-mers (the averages
        are divided by the number found).
    """
    num_to_find = options.max_num_sites
    data = stem_algorithm.data
    W = len(seed)
    logging.log(
        log_level, 'Finding best %d W-mers for seed %s', num_to_find, seed)
    # NOTE(review): the result is unpacked into three values - confirm that
    # this stempy.Model(...) call really returns a 3-tuple.
    _freqs, _freqs_with_pseudo_counts, model = stempy.Model(
        data,
        W,
        options
    )
    logging.debug('Seed pseudo-count: %f', model.bs.seed_pseudo_counts)
    logging.debug('Model lambda: %f', model.lambda_)
    model.bs.seed(seed, True)
    best_w_mer_finder = stempy.create_best_w_mer_finder(
        data, model, num_to_find)
    best_w_mer_finder()
    # Keep at most the last num_to_find entries of the finder's results.
    best_w_mers = list(best_w_mer_finder.best_w_mers)[-num_to_find:]
    avg_Z = 0.
    avg_distance = 0.
    for eval_ in best_w_mers:
        w_mer = data.get_W_mer(W, eval_.global_pos)
        # Reverse-complement hits flagged rev_comp before comparing them
        # with the seed.
        if eval_.rev_comp:
            w_mer = reverse_complement(w_mer)
        distance = hamming_distance(seed, w_mer)
        logging.log(log_level, 'Best W-mer: %s; Z=%.5f, distance=%d',
                    w_mer, eval_.Z, distance)
        avg_Z += eval_.Z
        avg_distance += distance
    best_w_mer_finder.update_model(num_to_find, use_pseudo_counts=False)
    log_pop = model.log_product_of_pvalues
    avg_Z /= len(best_w_mers)
    logging.log(log_level, 'Seed: %s; log PoP: %.6f', seed, log_pop)
    logging.log(log_level, 'Seed: %s; Average Z=%.7f', seed, avg_Z)
    logging.log(log_level, 'Seed: %s; Average distance=%.2f',
                seed, avg_distance / len(best_w_mers))
    # range() exists on both Python 2 and 3 (xrange does not), and passing
    # the value as a logging argument defers the string formatting.
    logging.debug(
        'Frequencies: %s',
        ' '.join(str(model.bg.freqs.freq(x)) for x in range(4)))
    return avg_Z, log_pop
    def test_we_get_all_W_mers_we_asked_for(self):
        """
        Check the finder returns as many W-mers as were requested.

        For each hard-coded seed a model is seeded once, then the finder is
        run for 2, 4, 8, 16 and 32 W-mers against ``T00759-small.fa``.

        :raises ValueError: If fewer W-mers than requested are returned and
            that count differs from ``model.num_W_mers``.
        """
        fasta_file = os.path.normpath(get_fasta_file('T00759-small.fa'))
        site_counts = [2, 4, 8, 16, 32]
        self.options.max_num_sites = max(site_counts)
        self.options.min_num_sites = min(site_counts)

        # Load the sequences and build the index.
        algorithm = stempy.Algorithm(self.options)
        algorithm._initialise(fasta_file)
        data = algorithm.input_sequences.data

        seeds = (
            'GCTAGCTAGCGG',
            'ATGCAGAAAAATTAAG',
            'TTTAAAATACTTTAAA',
        )
        for seed in seeds:
            # Seed one model per seed; it is reused for every requested count.
            logging.info('Using seed %s', seed)
            W = len(seed)
            model = algorithm.create_model_of_input(W)
            model.bs.seed(seed, True)
            model.set_lambda_for_sites(data.num_sequences)

            for num_to_find in site_counts:
                # Search for the best W-mers under the seeded model.
                finder = stempy.create_best_w_mer_finder(data, model, num_to_find)
                finder()

                # Being short is only an error when more W-mers were
                # available than were returned.
                if (
                    len(finder.best_w_mers) < num_to_find
                    and len(finder.best_w_mers) != model.num_W_mers
                ):
                    logging.warning('Found %d W-mers', len(finder.best_w_mers))
                    logging.warning('%d W-mers available', model.num_W_mers)
                    logging.warning('Wanted %d W-mers', num_to_find)
                    raise ValueError('Did not find enough W-mers')
# Script section: build a seeded model, then run two searches over `data`.
# W, options, seed, data, bg_model and Timer are defined outside this chunk.

# Binding-site model: PSSM from stempy.initialise_uniform_pssm() for width W
# and the configured alphabet size, then seeded with `seed`.
bs_model = stempy.PssmBindingSiteModel(stempy.initialise_uniform_pssm(W, options.alphabet_size))
bs_model.seed(seed)

# whole model
# Combines the binding-site model with the background model; lambda is 0.
model = stempy.Model(data, bs_model, bg_model, _lambda=0.)

# Search 1: every instance found by FindInstances for this Z threshold.
# Timer presumably reports the elapsed time of the block under `msg` -
# confirm against its definition.
Z_threshold = .3
with Timer(msg='find instances with Z>%f' % Z_threshold):
    instance_finder = stempy.FindInstances(data, model, Z_threshold)
    instance_finder()
    logging.info('Found %d instances', len(instance_finder.instances))


# Search 2: ask the best W-mer finder for up to num_W_mers_to_find W-mers.
num_W_mers_to_find = 10000
with Timer(msg='find %d best W-mers' % num_W_mers_to_find):
    w_mer_finder = stempy.create_best_w_mer_finder(data, model, num_W_mers_to_find)
    w_mer_finder()
    # NOTE(review): the message says "instances" but the count is of best
    # W-mers - possibly copied from the search above; confirm the wording.
    logging.info('Found %d instances', len(w_mer_finder.best_w_mers))
    

def global_overlap(pos1, pos2, W):
    """Return True when positions pos1 and pos2 are fewer than W apart."""
    separation = pos1 - pos2
    # Same test as abs(separation) < W, written as a two-sided bound.
    return -W < separation < W

def get_non_overlapping(instances, W):
    instances.sort()
    instances.reverse()
    result = []
    for i in instances:
        for better in result:
            if global_overlap(i.global_pos, better.global_pos, W):
                break