def evaluate_mosaics(max_mosaics=6, max_order=3):
    """
    Evaluate mosaic HMMs of different sizes and Markov orders on the
    chip-chip training/test sequence splits.

    For every (order, num_mosaics) combination, a fresh mosaic model is
    trained with Baum-Welch on each training split and scored by its
    summed log-likelihood over the matching test split.

    Parameters:
        max_mosaics: largest number of mosaic components to try (tries 1..max_mosaics).
        max_order: highest Markov order to try (tries 0..max_order).

    Returns:
        A list of (order, num_mosaics, LL) tuples, one per combination,
        where LL is the total test-set log-likelihood.
    """
    from gapped_pssms import data
    sequences = data.training_test_sequences()
    # NOTE(review): the original also built `preprocessed_sequences` via
    # hmm.preprocess_sequence but never used it; that dead computation has
    # been removed (preprocess_sequence presumably has no side effects —
    # confirm against the hmm module).
    result = []
    for order in range(max_order + 1):
        # Re-encode the raw sequences once per Markov order.
        converter = hmm.MarkovOrderConverter(alphabet_size=4, order=order)
        order_n_seqs = [
            ([converter.to_order_n(s) for s in training],
             [converter.to_order_n(s) for s in test])
            for training, test in sequences
        ]
        for num_mosaics in range(1, max_mosaics + 1):
            LL = 0.
            for training_seqs, test_seqs in order_n_seqs:
                model = hmm.as_model(create_mosaic_model(
                    num_mosaics=num_mosaics,
                    p_transition=0.,
                    alphabet_size=4,
                    order=order,
                    dirichlet_prior_strength=10.))
                model.baum_welch(training_seqs)
                # Accumulate held-out log-likelihood across all splits.
                LL += sum(model.LL(s) for s in test_seqs)
            logging.info('Order: %d; # mosaics: %d; LL: %f', order, num_mosaics, LL)
            result.append((order, num_mosaics, LL))
    return result
def load_seqs(filename):
    "Load and convert sequences from fasta file."
    logging.info('Loading sequences: %s', filename)
    sequences = dict(sequences_from_fasta(filename))
    # Preprocess each raw sequence for the HMM, keyed by its description.
    numpy_seqs = {}
    for desc, (seq, _) in sequences.iteritems():
        numpy_seqs[desc] = hmm.preprocess_sequence(seq_to_numpy(seq))
    # Element-wise sum of every sequence's tally (starts from int 0, as sum does).
    combined_tally = sum(N.array(t) for _, (_, t) in sequences.iteritems())
    total_bases = sum(len(s) for s, _ in sequences.values())
    logging.info('Loaded %d sequences with %d bases', len(sequences), total_bases)
    return numpy_seqs, combined_tally
def evaluate_mosaics(max_mosaics=6, max_order=3):
    """
    Evaluate mosaic HMMs over a grid of sizes and Markov orders using the
    chip-chip training/test splits.

    Each (order, num_mosaics) pair gets a freshly created mosaic model,
    trained by Baum-Welch on every training split and scored by the summed
    log-likelihood of the corresponding test split.

    Parameters:
        max_mosaics: largest mosaic count evaluated (range 1..max_mosaics).
        max_order: highest Markov order evaluated (range 0..max_order).

    Returns:
        List of (order, num_mosaics, LL) tuples.
    """
    from gapped_pssms import data
    sequences = data.training_test_sequences()
    # NOTE(review): removed the original's unused `preprocessed_sequences`
    # list (built with hmm.preprocess_sequence and never read) — dead work,
    # assuming preprocess_sequence is side-effect free; verify in hmm.
    result = []
    for order in range(max_order + 1):
        converter = hmm.MarkovOrderConverter(alphabet_size=4, order=order)
        # Convert every split to the current Markov order's encoding.
        order_n_seqs = [
            ([converter.to_order_n(s) for s in training],
             [converter.to_order_n(s) for s in test])
            for training, test in sequences
        ]
        for num_mosaics in range(1, max_mosaics + 1):
            LL = 0.
            for training_seqs, test_seqs in order_n_seqs:
                model = hmm.as_model(create_mosaic_model(
                    num_mosaics=num_mosaics,
                    p_transition=0.,
                    alphabet_size=4,
                    order=order,
                    dirichlet_prior_strength=10.))
                model.baum_welch(training_seqs)
                # Held-out log-likelihood, summed over all test splits.
                LL += sum(model.LL(s) for s in test_seqs)
            logging.info('Order: %d; # mosaics: %d; LL: %f', order, num_mosaics, LL)
            result.append((order, num_mosaics, LL))
    return result
def load_seqs(filename):
    "Load and convert sequences from fasta file."
    logging.info('Loading sequences: %s', filename)
    sequences = dict(sequences_from_fasta(filename))
    # Map description -> HMM-preprocessed numpy sequence.
    preprocessed = dict(
        (name, hmm.preprocess_sequence(seq_to_numpy(raw)))
        for name, (raw, _) in sequences.iteritems())
    # Sum all per-sequence tallies element-wise as numpy arrays.
    summed_tally = sum(imap(N.array, (t for _, (_, t) in sequences.iteritems())))
    base_count = 0
    for raw, _ in sequences.values():
        base_count += len(raw)
    logging.info('Loaded %d sequences with %d bases', len(sequences), base_count)
    return preprocessed, summed_tally
[.05, .85, .05, .05], [.85, .05, .05, .05], [.05, .85, .05, .05], [.85, .05, .05, .05], [.05, .85, .05, .05], [.85, .05, .05, .05], [.05, .85, .05, .05], [.85, .05, .05, .05], [.05, .85, .05, .05], ]) logging.info('Calculating log scores') pssm_scores = hmm.calculate_log_scores(nucleo_dists) logging.info('Calculating complementary scores') comp_scores = hmm.calculate_complementary_scores(pssm_scores) logging.info('Preprocessing sequence') seq = hmm.preprocess_sequence( N.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4])) logging.info('Scoring sequence') logging.info(hmm.score_sequence(pssm_scores, seq)) logging.info('Scoring sequence with complementary scores') logging.info(hmm.score_sequence(comp_scores, seq)) logging.info(hmm.max_score_in_sequence(pssm_scores, seq)) logging.info(hmm.max_score_in_sequence(comp_scores, seq)) long_seqs = [R.random_integers(0, 4, size=10000) for i in xrange(100)] long_seqs_preprocessed = hmm.preprocess_sequences(long_seqs) logging.info('Starting to time max scores on long sequences.') start = time.time() max_scores = hmm.max_scores_in_sequences(pssm_scores,
def __call__(self, sequences, bg_model=None):
    """
    Run the motif finding algorithm over the given sequences.

    Generates seeds, trains an HMM from each of the most promising seeds,
    scores the resulting PSSMs, logs a report for the highest-scoring
    ones, and returns the per-seed training results.

    Parameters:
        sequences: the input sequences (indexable, each with a length).
        bg_model: optional background model; note it is overwritten by the
            one returned from generate_seeds below.

    Returns:
        The list of successful try_seed results (one per seed that
        produced a model). NOTE(review): this is the post-filter
        `results` list, NOT the threshold-pruned `scored_results` — every
        successful seed's result is returned regardless of score; confirm
        whether that is intended.
    """
    logging.info('Looking for at least %d PSSMs', self.options.num_pssms)
    if self.options.max_L_mers_to_evaluate < self.options.num_pssms:
        raise ValueError('Cannot find any more PSSMs than L-mers are evaluated.')
    num_bases = sum(len(s) for s in sequences)
    logging.info('Running single gap algorithm on %d sequences with %d bases',
                 len(sequences), num_bases)
    preprocessed_sequences = [hmm.preprocess_sequence(s) for s in sequences]
    # Time the seed-generation phase (time.clock: CPU time on Python 2).
    start = time.clock()
    seeds, bg_model = generate_seeds(
        sequences,
        preprocessed_sequences,
        self.options
    )
    logging.info('Generating %d seeds took %.1f seconds', len(seeds), time.clock() - start)
    # try the best few seeds
    start = time.clock()
    # Python 2 and/or ternary: use the configured count if truthy (non-zero),
    # otherwise default to twice the number of requested PSSMs.
    num_to_examine = self.options.num_seeds_to_examine and self.options.num_seeds_to_examine or 2 * self.options.num_pssms
    logging.info('Examining %d/%d seeds', num_to_examine, len(seeds))
    p_one_per_seq = len(preprocessed_sequences) / float(num_bases) # expect one per sequence
    self.p_binding_site = p_one_per_seq * self.options.p_binding_site_scale
    logging.info(
        'HMM p(binding site) parameter estimated as %f (1 site/seq) and adjusted to %f by scaling parameter',
        p_one_per_seq,
        self.p_binding_site,
    )
    # zip with xrange truncates the seed list to the first num_to_examine seeds.
    results = list(
        (seed, self.try_seed(seed, len(sequences), num_bases, preprocessed_sequences))
        for seed, i in zip(seeds, xrange(num_to_examine))
    )
    # keep only those results that succeeded
    results = filter(lambda x: x[1], results)
    logging.info('Got PSSMs for %d seeds in %.1f seconds', len(results), time.clock() - start)
    # define a function that scores results
    def score_result(result):
        # Each result is a (seed, try_seed-result) pair; unpack both tuples.
        seed, result = result
        L_mer, count, num_seqs, gap, score = seed
        model, builder, num_sites, num_seqs_with_site = result
        emissions, gap_probs = builder.get_emissions_and_gap_probabilities(model, offset=1)
        # Geometric mean of entropy, information content, and the fraction
        # of sequences containing at least one site.
        return geometric_mean((
            calculate_first_order_entropy_score(emissions),
            calculate_information_content_score(emissions),
            num_seqs_with_site /
            float(len(sequences))
        ))
    # sort by scores
    scored_results = [(score_result(result), result) for result in results]
    scored_results.sort(reverse=True)
    # remove those PSSMs that do not score highly enough
    # (never prunes below num_pssms entries, even if they score under the threshold)
    logging.info('Removing PSSMs with low scores.')
    while len(scored_results) > self.options.num_pssms and scored_results[-1][0] < self.options.pssm_score_threshold:
        scored_results.pop()
    logging.info('%d PSSMs scored highly enough', len(scored_results))
    # examine results
    for i, (score, (seed, result)) in enumerate(scored_results):
        logging.info('************** PSSM %d **************', i)
        L_mer, count, num_seqs, gap, seed_score = seed
        model, builder, num_sites, num_seqs_with_site = result
        logging.info(
            'Seed %s with gap at %d had %d hits in %d/%d sequences',
            L_mer, gap, count, num_seqs, len(sequences)
        )
        logging.info('Seed score: %f', seed_score)
        # Output file names are tagged and numbered per PSSM.
        image_file = os.path.join(self.options.output_dir, '%s-%03d' % (self.options.tag, i))
        pssm_def_file = os.path.join(self.options.output_dir, '%s-%03d.pssm' % (self.options.tag, i))
        logging.info(
            'HMM found %d sites. %d/%d sequences have at least one site',
            num_sites, num_seqs_with_site, len(sequences)
        )
        self.examine_model(model, builder, sequences, image_file, pssm_def_file)
        logging.info('Score: %g', score)
        emissions, gap_probs = builder.get_emissions_and_gap_probabilities(model, offset=1)
    # Return the unpruned (but success-filtered) results, not the scored subset.
    return [r[1] for r in results]
AATACTATTACTATACCCACGACCTCCAGAAATTCACTGGATAACCAGTAAGACAACTTCTACTCATTTCTTCATATTCC TACTTATTCAAGTTGTAGCCTTCATAGTTGATAAAAAATCAGCACACATTAAGAAAACAATAACAGAACTATTTTCTTCA CATGACTTTTATTCCTTAATCCAGACTGTTAAAAGGACTGCAAGACAAATTGTTTTTCAATCAGATTTTTTTCTCCACCA GATGTCTATGTGAATTTCATATTGTTTTAGACAAAAATGCTCATTCCTTCGGTCTAAGTACTATGTCATATTTTGTTTTT TCAAGCCTTCAAATTTTGTGCTGGTGGTTACTTCATATACATTCTATGGTTAATCTTTAAAGAGAAGTTTTAAAAGTCTG ATTCAAAATTTCAGTTCACTCGCTATGTATTTTAAAAATTAAAATTTATGAAATTCAATTTTAAAAATCTAAAAGTTATC TAAAAAGGTCTATGACTTATCAAATTTCAATAAGCTGACTGTTAGCAGTATTAAAAAATATTAAATATGCTAACANNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNATACATAAAGGGAATAGGCAGAGTTCACAGATT AATATTTCTTACCTCTACAATAAGAAGAAATACCTTGTTCTATGAGCAGCTGCCATACTTTCAGACATGTTTCTGACTTT TAGATAATTAACAAATCCTCTGAAGAAAAGGAGCAGGCCTGAGAAGGTTGAAATAATATGGATATACTATGTTTTTATAC AGAAAAGGGCAAGATAAATTTAAAGTAGACAATTATAAACANNNNNNNNNNNNNNNNNGGA""".replace('\n', '') def convert_seq(seq): return numpy.array(corebio.seq.Seq(seq, alphabet=corebio.seq.reduced_nucleic_alphabet).ords()) old_pp = hmm.preprocess_sequence(convert_seq(old_seq)) new_pp = hmm.preprocess_sequence(convert_seq(new_seq)) #meme_dir = '/home/reid/Analysis/GappedPssms/MEME/x-validate' #pssm_file = os.path.join(meme_dir, 'T00671-1.pssm') pssm_file = '/home/john/Analysis/GappedPssms/MEME/x-validate/vm-T00671-motif-h2-v9-x1.pssm' semi_parsed_models = list(parse_models(open(pssm_file))) if len(semi_parsed_models) > 1: print >> sys.stderr, 'For the moment we can only handle one model at a time.' sys.exit(-1) parsed = semi_parsed_models[0] logging.info(str(parsed)) model, traits = build_hmm_from_semi_parsed(parsed) classifier = make_classifier(model) def test_seq(seq):
[.85, .05, .05, .05], [.05, .85, .05, .05], [.85, .05, .05, .05], [.05, .85, .05, .05], [.85, .05, .05, .05], [.05, .85, .05, .05], [.85, .05, .05, .05], [.05, .85, .05, .05], ] ) logging.info('Calculating log scores') pssm_scores = hmm.calculate_log_scores(nucleo_dists) logging.info('Calculating complementary scores') comp_scores = hmm.calculate_complementary_scores(pssm_scores) logging.info('Preprocessing sequence') seq = hmm.preprocess_sequence(N.array([0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,4])) logging.info('Scoring sequence') logging.info(hmm.score_sequence(pssm_scores, seq)) logging.info('Scoring sequence with complementary scores') logging.info(hmm.score_sequence(comp_scores, seq)) logging.info(hmm.max_score_in_sequence(pssm_scores, seq)) logging.info(hmm.max_score_in_sequence(comp_scores, seq)) long_seqs = [R.random_integers(0,4,size=10000) for i in xrange(100)] long_seqs_preprocessed = hmm.preprocess_sequences(long_seqs) logging.info('Starting to time max scores on long sequences.') start = time.time() max_scores = hmm.max_scores_in_sequences(pssm_scores, long_seqs_preprocessed)