def prior(self, model):
    """Build a hmm.ModelPrior whose A, B and pi pseudo-counts are the
    parameters of a freshly generated uniform model, scaled by
    self.prior_strength.

    NOTE: the 'model' argument is not used; the prior is derived purely
    from a new uniform model.
    """
    uniform = hmm.as_model(self.new_model(generate_uniform=True))
    result = hmm.ModelPrior(self.N(), self.M())
    result.A = self.prior_strength * uniform.A
    result.B = self.prior_strength * uniform.B
    result.pi = self.prior_strength * uniform.pi
    return result
def load(self, index):
    """Load a previously saved background mosaic model.

    @param index: an (order, num_mosaics, fragment) tuple identifying the model.
    @raise RuntimeError: if no readable model file exists for the index.
    @return: the loaded model, converted via hmm.as_model.
    """
    order, num_mosaics, fragment = index
    filename = self.path_for(index)
    readable = os.access(filename, os.R_OK)
    if not readable:
        raise RuntimeError('Have no model for %s' % self.index_as_string(index))
    builder = hmm.pssm.ModelBuilder(order)
    loaded = builder.load_background_mosaic_model(filename)
    return hmm.as_model(loaded)
def evaluate_mosaics(max_mosaics=6, max_order=3):
    """
    Evaluate different mosaic models on chip-chip fragments.

    For every combination of Markov order (0..max_order) and number of
    mosaics (1..max_mosaics), trains a mosaic model on each training set
    and accumulates the log-likelihood over the matching test set.

    @param max_mosaics: largest number of mosaics to try.
    @param max_order: largest Markov order to try.
    @return: a list of (order, num_mosaics, LL) tuples.
    """
    from gapped_pssms import data
    sequences = data.training_test_sequences()
    mosaic_sizes = range(1, max_mosaics + 1)
    orders = range(max_order + 1)
    # Fixed: removed an unused 'preprocessed_sequences' computation that
    # pre-processed every sequence and then discarded the result.
    result = list()
    for order in orders:
        converter = hmm.MarkovOrderConverter(alphabet_size=4, order=order)
        # convert every training/test sequence to this Markov order once
        order_n_seqs = [
            ([converter.to_order_n(s) for s in training],
             [converter.to_order_n(s) for s in test])
            for training, test in sequences
        ]
        for num_mosaics in mosaic_sizes:
            LL = 0.
            for training_seqs, test_seqs in order_n_seqs:
                model = hmm.as_model(
                    create_mosaic_model(num_mosaics=num_mosaics,
                                        p_transition=0.,
                                        alphabet_size=4,
                                        order=order,
                                        dirichlet_prior_strength=10.))
                model.baum_welch(training_seqs)
                # score on held-out data
                LL += sum(model.LL(s) for s in test_seqs)
            logging.info('Order: %d; # mosaics: %d; LL: %f',
                         order, num_mosaics, LL)
            result.append((order, num_mosaics, LL))
    return result
def learn_bg_model(sequences, num_mosaics=4, order=3, tolerance_per_base=7e-5):
    """
    Learn a mosaic background model on the given sequences via Baum-Welch.

    @param sequences: order-0 sequences to train on.
    @param num_mosaics: number of mosaic components in the model.
    @param order: Markov order of the model.
    @param tolerance_per_base: convergence tolerance, scaled by the total
        number of bases in the sequences.
    @return: (bg_model, converted_sequences)
    """
    # Fixed: num_mosaics and order were previously ignored here -- the
    # values 4 and 3 were hard-coded even though the log message below
    # reported the parameters as if they were used.
    bg_model = hmm.as_model(
        create_mosaic_model(
            num_mosaics=num_mosaics,
            p_transition=.1,
            alphabet_size=4,
            order=order,
            dirichlet_prior_strength=.3
        )
    )
    converted_seqs = [bg_model.converter.to_order_n(s) for s in sequences]

    def _callback(LL):
        # returning True tells baum_welch to keep iterating
        logging.debug('Background model LL: %f', LL)
        return True

    tolerance = tolerance_per_base * sum(len(s) for s in sequences)
    logging.info(
        'Learning background model with %d mosaics of order %d, tolerance=%f',
        num_mosaics, order, tolerance)
    start = time.time()
    LL, iterations = bg_model.baum_welch(
        converted_seqs, tolerance=tolerance, callback=_callback)
    logging.info('Achieved LL=%f after %d iterations and %f seconds',
                 LL, iterations, time.time() - start)
    return bg_model, converted_seqs
def _model_for_L_mer(self, L_mer, gap_index, p_binding_site):
    """Create a model initialised by this K-mer."""
    # locate the mer within the PSSM and obtain a builder for the model
    mer_len = len(L_mer)
    start, builder = self._make_builder(gap_index, mer_len)
    # emission distributions derived from the mer (including its gap position)
    nucleo_dist = nucleo_dist_from_mer(
        seq_to_numpy(L_mer),
        self.options.pseudo_count_for_model_initialisation,
        gap_index=gap_index
    )
    # start from uniform emissions everywhere...
    emissions = numpy.ones((self.options.K, 4)) / 4.
    # ...then overwrite the mer_len+1 positions covered by the mer and gap
    emissions[start:start + mer_len + 1] = nucleo_dist
    # build the PSSM and embed it in a simple background model
    pssm, in_states, out_states = builder.create(p_gap=.5, emissions=emissions)
    combined = single_gap.add_to_simple_background_model(
        model=pssm,
        in_states=in_states,
        out_states=out_states,
        p_binding_site=p_binding_site
    )
    return hmm.as_model(combined)
def create_test_data_generating_model(self, dirichlet_strength=.05):
    """Create a model (drawn with the given Dirichlet strength) for
    generating test data, with initial probabilities concentrated
    uniformly on the background mosaic states.

    @param dirichlet_strength: strength of the Dirichlet draw for the
        new model's parameters.
    """
    # Fixed: the dirichlet_strength parameter was previously ignored
    # (a hard-coded .05 was passed instead).
    data_generating_model = hmm.as_model(
        self.new_model(dirichlet_strength=dirichlet_strength))
    # zero all initial probabilities...
    for i in xrange(data_generating_model.N):
        data_generating_model.set_initial(i, 0.0)
    # ...then spread them uniformly over the background mosaic states
    for i in xrange(self.num_background_mosaics):
        data_generating_model.set_initial(i, 1.0 / self.num_background_mosaics)
    data_generating_model.normalise()
    return data_generating_model
def build_hmm_model(freqs, gaps, p_binding_site=.001):
    """
    @return: A hmm.Model representing the gapped PWM defined by the arguments.
    """
    by_states = build_model_by_states(freqs, gaps, p_binding_site=p_binding_site)
    result = hmm.as_model(by_states)
    result.normalise()
    return result
def log_info(self, model, log):
    """Log the adjusted information content per base and any gap
    probabilities that exceed the gap threshold.

    @param model: the model to report on (converted via hmm.as_model).
    @param log: logger to write to.
    """
    model = hmm.as_model(model)
    BaseTraits.log_info(self, model, log)
    log.info('Adjusted IC/base = %.4f' % self.information_content_per_base(model))
    # Fixed: removed the unused local 'A = model.A'.
    for k in xrange(self.K):
        p_gap = self.p_gap_for_model(model, k)
        if p_gap > self.gap_threshold:
            log.info('p(gap at %d) = %.4f' % (k, p_gap))
def test_traits(self):
    """Exercise GappedPssmTraits over Markov orders 0-2: check
    reverse-complement transition parameter tying, mirrored emission
    matrices, and the Viterbi parse of a sequence matching the PSSM."""
    from hmm.pssm import create_background_model, seq_to_numpy
    from infpy import check_is_close_2
    for order in [0, 1, 2]:
        num_background_mosaics = 4
        traits = hmm.pssm.GappedPssmTraits(
            K=7,
            p_binding_site=.1,
            background_order=order,
            num_background_mosaics=num_background_mosaics,
            background_model_creator=create_background_model
        )
        emission_dists = [
            [1., 0., 0., 0.],
            [0., 1., 0., 0.],
            [0., 1., 0., 0.],
            [1., 0., 0., 0.],
            [0., 0., 1., 0.],
            [0., 0., 0., 1.],
            [0., 0., 0., 1.],
            [0., 0., 0., 1.],
            [0., 0., 1., 0.],
            [0., 1., 0., 0.],
            [1., 0., 0., 0.],
            [0., 1., 0., 0.],
            [0., 0., 0., 1.],
        ]
        model = hmm.as_model(traits.new_model(emission_dists))

        # reverse complement gap transitions share one parameterisation...
        param = model.get_transition_parameterisation
        assert param(29, 28).idx == param(14, 15).idx
        # ...but unrelated transitions do not
        assert param(29, 28).idx != param(4, 5).idx

        # emissions of reverse complement states mirror the originals
        B = model.B
        for state in xrange(model.N):
            for obs in xrange(model.M):
                rc_state, rc_obs = traits.get_non_reverse_complement(state, obs)
                assert check_is_close_2(B[rc_state, rc_obs], B[state, obs]), (
                    '%d,%d %d,%d: %f %f' % (
                        rc_state, rc_obs, state, obs,
                        B[rc_state, rc_obs], B[state, obs]))

        # Viterbi on a matching sequence should walk the PSSM states in order
        test_seq = 'acgtgat'  # matches dist above
        order_0 = hmm.pssm.seq_to_numpy(test_seq)
        order_n = model.converter.to_order_n(order_0)
        LL, states = model.viterbi(order_n)
        for i, state in enumerate(states):
            assert (state - num_background_mosaics) / 2 == i
def test_traits(self):
    """Check GappedPssmTraits for orders 0-2: reverse-complement
    parameter sharing, mirrored emissions, and the Viterbi path for a
    sequence that matches the PSSM.

    NOTE(review): this duplicates an identically-named test defined
    earlier in the file; only this later definition is actually run.
    """
    from hmm.pssm import create_background_model, seq_to_numpy
    from infpy import check_is_close_2
    num_background_mosaics = 4
    emission_dists = [
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 0., 1.],
    ]
    for order in [0, 1, 2]:
        traits = hmm.pssm.GappedPssmTraits(
            K=7,
            p_binding_site=.1,
            background_order=order,
            num_background_mosaics=num_background_mosaics,
            background_model_creator=create_background_model)
        m = hmm.as_model(traits.new_model(emission_dists))

        # reverse complement gaps must share a transition parameterisation
        tied = m.get_transition_parameterisation(29, 28).idx
        assert tied == m.get_transition_parameterisation(14, 15).idx
        assert tied != m.get_transition_parameterisation(4, 5).idx

        # each reverse complement state's emissions mirror the original's
        B = m.B
        for n in xrange(m.N):
            for o in xrange(m.M):
                rc_n, rc_o = traits.get_non_reverse_complement(n, o)
                message = '%d,%d %d,%d: %f %f' % (
                    rc_n, rc_o, n, o, B[rc_n, rc_o], B[n, o])
                assert check_is_close_2(B[rc_n, rc_o], B[n, o]), message

        # Viterbi should assign PSSM state 2*i to position i of a match
        test_seq = 'acgtgat'  # matches dist above
        converted = m.converter.to_order_n(hmm.pssm.seq_to_numpy(test_seq))
        LL, states = m.viterbi(converted)
        for i, state in enumerate(states):
            assert (state - num_background_mosaics) / 2 == i
def uniform_bg_model():
    """@return: a single-mosaic, order-0 background model built without a
    Dirichlet prior."""
    mosaic = create_mosaic_model(
        num_mosaics=1,
        p_transition=.1,
        alphabet_size=4,
        order=0,
        dirichlet_prior_strength=None
    )
    return hmm.as_model(mosaic)
def create_test_data_generating_model(self, dirichlet_strength=.05):
    """Create a model (drawn with the given Dirichlet strength) for
    generating test data: all gap transitions are disabled except a 0.5
    gap at the PSSM midpoint, and initial probabilities are spread
    uniformly over the background mosaic states.

    @param dirichlet_strength: strength of the Dirichlet draw for the
        new model's parameters.
    """
    # Fixed: the dirichlet_strength parameter was previously ignored
    # (a hard-coded .05 was passed instead).
    data_generating_model = hmm.as_model(
        self.new_model(dirichlet_strength=dirichlet_strength))
    # set all gaps to 0.0...
    for k in xrange(self.K - 1):
        data_generating_model.set_transition(self.kth(k), self.kth_gap(k), 0.0)
    # ...except for the gap at the midpoint
    data_generating_model.set_transition(
        self.kth(self.K / 2), self.kth_gap(self.K / 2), 0.5)
    # only background mosaic states may start a sequence, uniformly
    for i in xrange(data_generating_model.N):
        data_generating_model.set_initial(i, 0.0)
    for i in xrange(self.num_background_mosaics):
        data_generating_model.set_initial(i, 1.0 / self.num_background_mosaics)
    data_generating_model.normalise()
    return data_generating_model
def information_content_per_base(self, model):
    """Return the information content per expected base of the PSSM.

    Each of the K non-gap positions contributes fully; each optional gap
    position contributes weighted by its gap probability, both to the
    total information content and to the expected number of bases.
    """
    model = hmm.as_model(model)
    emissions = self.pssm_dist(model)
    total_ic = 0.0
    expected_bases = 0.0
    for k in xrange(self.K):
        total_ic += hmm.pssm.base_information_content(emissions[2 * k])
        expected_bases += 1.0
        if k < self.K - 1:
            p_gap = self.p_gap_for_model(model, k)
            expected_bases += p_gap
            total_ic += p_gap * hmm.pssm.base_information_content(emissions[2 * k + 1])
    return total_ic / expected_bases
def evaluate_mosaics(max_mosaics=6, max_order=3):
    """
    Evaluate different mosaic models on chip-chip fragments.

    Trains one mosaic model per (order, num_mosaics) pair on each training
    set and sums the log-likelihoods over the corresponding test sets.

    @param max_mosaics: largest number of mosaics to try.
    @param max_order: largest Markov order to try.
    @return: a list of (order, num_mosaics, LL) tuples.
    """
    from gapped_pssms import data
    sequences = data.training_test_sequences()
    mosaic_sizes = range(1, max_mosaics + 1)
    orders = range(max_order + 1)
    # Fixed: removed an unused 'preprocessed_sequences' computation whose
    # result was never read.
    result = list()
    for order in orders:
        converter = hmm.MarkovOrderConverter(alphabet_size=4, order=order)
        order_n_seqs = [
            (
                [converter.to_order_n(s) for s in training],
                [converter.to_order_n(s) for s in test]
            )
            for training, test in sequences
        ]
        for num_mosaics in mosaic_sizes:
            LL = 0.
            for training_seqs, test_seqs in order_n_seqs:
                model = hmm.as_model(
                    create_mosaic_model(
                        num_mosaics=num_mosaics,
                        p_transition=0.,
                        alphabet_size=4,
                        order=order,
                        dirichlet_prior_strength=10.
                    )
                )
                model.baum_welch(training_seqs)
                # evaluate on the held-out test sequences
                LL += sum(model.LL(s) for s in test_seqs)
            logging.info('Order: %d; # mosaics: %d; LL: %f',
                         order, num_mosaics, LL)
            result.append((order, num_mosaics, LL))
    return result
def model_for_initialisation_K_mer(self, K_mer, p_binding_site):
    """Create a model initialised by this K-mer."""
    # pseudo-counted emission distributions, one row per PSSM position
    dists = numpy.ones((self.K, 4)) * self.initialisation_pseudo_count
    for k, base in enumerate(K_mer):
        if 4 == base:
            # base code 4: spread the count over all four bases
            # (presumably an unknown/'N' base -- verify against the encoder)
            dists[k] += 0.25
        else:
            dists[k, base] += 1.0
    uniform_gap = numpy.ones((4,)) / 4.0
    pssm, in_states, out_states = self.builder.create(
        p_gap=0.5,
        non_gap_emissions=dists,
        gap_emissions=uniform_gap
    )
    return hmm.as_model(
        single_gap.add_to_simple_background_model(
            model=pssm,
            in_states=in_states,
            out_states=out_states,
            p_binding_site=p_binding_site
        )
    )
def write_logo(self, model, f, rev_comp=False):
    """Render the model's PSSM as a sequence logo and save it as a PNG to f.

    Gap positions are only drawn when their gap probability exceeds
    self.gap_threshold; the gap probability is used as the column's
    transparency. If rev_comp is set, the logo is reversed and each
    distribution complemented (by reversing it).

    @return: the generated image.
    """
    import hmm.pssm.logo as logo
    model = hmm.as_model(model)
    emissions = self.pssm_dist(model)
    columns = []
    alphas = []
    for k in xrange(self.K):
        # the non-gap position is always shown at full opacity
        columns.append(emissions[2 * k])
        alphas.append(1.0)
        if k < self.K - 1:
            p_gap = self.p_gap_for_model(model, k)
            if p_gap > self.gap_threshold:
                columns.append(emissions[2 * k + 1])
                alphas.append(p_gap)
    if rev_comp:
        columns.reverse()
        columns = [dist[::-1] for dist in columns]
        alphas.reverse()
    image = logo.pssm_as_image(columns, transparencies=alphas)
    image.save(f, "PNG")
    return image
) emissions[builder.gap_index] = hmm.dirichlet_draw(numpy.ones(builder.M) * .3) model_by_states, in_states, out_states = builder.create( p_gap=.6, emissions=emissions ) # create a background model and add the single gapped pssm to it complete_model = add_to_simple_background_model( model_by_states, in_states, out_states, p_binding_site=.01) # convert to other type of model model = hmm.as_model(complete_model) # write as a graph hmm.graph_as_svg( model, 'single-gapped-hmm', graphing_keywords={'include_emissions':False}, neato_properties={'-Elen':2} ) # get the emissions and gap probabilities and write a logo emissions_copy, gap_probs = builder.get_emissions_and_gap_probabilities(model, offset=1) assert (emissions_copy - emissions).sum() < 1e-10 import hmm.pssm.logo as logo image = logo.pssm_as_image(emissions, transparencies=gap_probs) image.save("single-gapped-pssm-logo.png", "PNG")
e=me[i*max_mosaics:(i+1)*max_mosaics] plot([x[1] for x in e], [x[2] for x in e]) xlabel('# mosaics') ylabel('LL') title('Evaluation of mosaic models of various Markov orders') savefig('mosaic-evaluation.png', format='PNG') raise # load our sequences sequences = convert_fasta_sequences(fasta_file_for_fragment('T00671')) # build our model model_by_states = create_mosaic_model( num_mosaics=1, p_transition=0., alphabet_size=4, order=2, dirichlet_prior_strength=10. ) model = hmm.as_model(model_by_states) print model.B # convert our sequences to the correct order sequences_order_n = [model.converter.to_order_n(s) for s in sequences] #from IPython.Debugger import Pdb; Pdb().set_trace() def callback(LL): logging.info('LL: %f', LL) model.baum_welch(sequences_order_n, callback=callback) print model.B
hmm.dirichlet_draw(numpy.ones(builder.M) * .1) for k in xrange(builder.K) ]) emissions[builder.gap_index] = hmm.dirichlet_draw( numpy.ones(builder.M) * .3) model_by_states, in_states, out_states = builder.create( p_gap=.6, emissions=emissions) # create a background model and add the single gapped pssm to it complete_model = add_to_simple_background_model(model_by_states, in_states, out_states, p_binding_site=.01) # convert to other type of model model = hmm.as_model(complete_model) # write as a graph hmm.graph_as_svg(model, 'single-gapped-hmm', graphing_keywords={'include_emissions': False}, neato_properties={'-Elen': 2}) # get the emissions and gap probabilities and write a logo emissions_copy, gap_probs = builder.get_emissions_and_gap_probabilities( model, offset=1) assert (emissions_copy - emissions).sum() < 1e-10 import hmm.pssm.logo as logo image = logo.pssm_as_image(emissions, transparencies=gap_probs) image.save("single-gapped-pssm-logo.png", "PNG")
# Plot the test-set log-likelihood against the number of mosaics, one curve
# per Markov order, using the evaluation results in `me`
# (me, max_order and max_mosaics are defined earlier in the file).
for i in range(max_order):
    # slice out the (order, num_mosaics, LL) rows for this Markov order
    e = me[i * max_mosaics:(i + 1) * max_mosaics]
    plot([x[1] for x in e], [x[2] for x in e])
xlabel('# mosaics')
ylabel('LL')
title('Evaluation of mosaic models of various Markov orders')
savefig('mosaic-evaluation.png', format='PNG')
# NOTE(review): bare `raise` with no active exception aborts the script here;
# everything below appears to be kept for manual experimentation only.
raise
# load our sequences
sequences = convert_fasta_sequences(fasta_file_for_fragment('T00671'))
# build our model
model_by_states = create_mosaic_model(num_mosaics=1,
                                      p_transition=0.,
                                      alphabet_size=4,
                                      order=2,
                                      dirichlet_prior_strength=10.)
model = hmm.as_model(model_by_states)
print model.B
# convert our sequences to the correct order
sequences_order_n = [model.converter.to_order_n(s) for s in sequences]
#from IPython.Debugger import Pdb; Pdb().set_trace()
# progress callback: report the log-likelihood after each iteration
def callback(LL): logging.info('LL: %f', LL)
model.baum_welch(sequences_order_n, callback=callback)
print model.B
def get_p_binding(self, model):
    """@return: twice the transition probability from state 0 into the
    first state after the background mosaics (presumably doubled to
    account for both strands -- verify against the model layout)."""
    A = hmm.as_model(model).A
    return 2.0 * A[0, self.num_background_mosaics]
def log_info(self, model, log):
    """Log the PSSM's information content and the binding site probability."""
    model = hmm.as_model(model)
    ic = hmm.pssm.information_content(self.pssm_dist(model))
    log.info('IC: %.3f' % ic)
    log.info('p(binding site): %.6f' % self.p_binding_site_for_model(model))
def get_p_binding(self, model):
    """@return: twice the probability of transitioning from state 0 into
    the first post-background state (presumably doubled to cover both
    strands -- verify against the model layout)."""
    transitions = hmm.as_model(model).A
    return 2.0 * transitions[0, self.num_background_mosaics]
numpy.array([hmm.dirichlet_draw(numpy.ones(builder.M) * strength) for k in xrange(builder.K)]) for strength in dirichlet_prior_strengths ] gap_emissions = [hmm.dirichlet_draw(numpy.ones(builder.M) * strength) for strength in dirichlet_prior_strengths] # create out single gapped pssms pssms = [builder.create(p_gap, non_gap, gap) for non_gap, gap in zip(emissions, gap_emissions)] # create our complete models (by adding a background model) p_binding_site = exp_sites_per_sequence / L models = [ hmm.as_model( single_gap.add_to_simple_background_model( model=pssm[0], in_states=pssm[1], out_states=pssm[2], p_binding_site=p_binding_site ) ) for pssm in pssms ] # write our logos # convert to sequences and write fasta def tag(sample_idx): return "K%d-g%.2f-N%d-L%d-seed%d-%d" % (K, p_gap, N, L, seed, sample_idx) print "Writing logos" for i, model in enumerate(models): emissions, gap_probs = builder.get_emissions_and_gap_probabilities(model, offset=1)
def pssm_dist(self, model):
    """@return: the emission distributions (over the 4 bases) of the K
    PSSM states that follow the background mosaic states."""
    first = self.num_background_mosaics
    return hmm.as_model(model).B[first:first + self.K, :4]
def p_gap_for_model(self, model, k):
    """@return: the transition probability from the k'th PSSM state into
    its gap state."""
    transitions = hmm.as_model(model).A
    return transitions[self.kth(k), self.kth_gap(k)]