Exemple #1
0
 def prior(self, model):
     """
     Build the prior for models of this kind.

     The prior's A, B and pi are taken from a model created with
     generate_uniform=True, each multiplied by self.prior_strength.

     @param model: Not referenced by this implementation.
     @return: The populated hmm.ModelPrior.
     """
     uniform = hmm.as_model(self.new_model(generate_uniform=True))
     scaled = hmm.ModelPrior(self.N(), self.M())
     # the same scaling is applied to transitions, emissions and initial distribution
     for name in ('A', 'B', 'pi'):
         setattr(scaled, name, getattr(uniform, name) * self.prior_strength)
     return scaled
Exemple #2
0
 def load(self, index):
     """
     Load the background mosaic model stored for an index.

     @param index: An (order, num_mosaics, fragment) triple.
     @return: The model read from self.path_for(index), converted by hmm.as_model.
     @raise RuntimeError: If the file for the index cannot be read.
     """
     # only the order is needed below; the unpacking also insists on a 3-element index
     order, _num_mosaics, _fragment = index
     path = self.path_for(index)
     if os.access(path, os.R_OK):
         model_builder = hmm.pssm.ModelBuilder(order)
         return hmm.as_model(model_builder.load_background_mosaic_model(path))
     raise RuntimeError('Have no model for %s' % self.index_as_string(index))
Exemple #3
0
def evaluate_mosaics(max_mosaics=6, max_order=3):
    """
    Evaluate different mosaic models on chip-chip fragments.

    For every Markov order in 0..max_order and every number of mosaics in
    1..max_mosaics a fresh mosaic model is trained with Baum-Welch on each
    training set and the log likelihoods of the matching test sequences are
    summed.

    @return: A list of (order, num_mosaics, LL) tuples.
    """
    from gapped_pssms import data

    def _apply_to_all(convert, seq_pairs):
        "Apply convert to every sequence of every (training, test) pair."
        return [
            ([convert(s) for s in training], [convert(s) for s in test])
            for training, test in seq_pairs
        ]

    sequences = data.training_test_sequences()
    mosaic_sizes = range(1, max_mosaics + 1)
    orders = range(max_order + 1)
    # NOTE(review): the preprocessed sequences are never used below - the
    # order-n conversion works on the raw sequences. Confirm which is intended.
    preprocessed_sequences = _apply_to_all(hmm.preprocess_sequence, sequences)

    evaluations = []
    for order in orders:
        converter = hmm.MarkovOrderConverter(alphabet_size=4, order=order)
        order_n_seqs = _apply_to_all(converter.to_order_n, sequences)
        for num_mosaics in mosaic_sizes:
            total_LL = 0.
            for training_seqs, test_seqs in order_n_seqs:
                mosaic_model = create_mosaic_model(
                    num_mosaics=num_mosaics,
                    p_transition=0.,
                    alphabet_size=4,
                    order=order,
                    dirichlet_prior_strength=10.)
                model = hmm.as_model(mosaic_model)
                model.baum_welch(training_seqs)
                total_LL += sum(model.LL(s) for s in test_seqs)
            logging.info('Order: %d; # mosaics: %d; LL: %f', order,
                         num_mosaics, total_LL)
            evaluations.append((order, num_mosaics, total_LL))
    return evaluations
def learn_bg_model(
  sequences,
  num_mosaics=4,
  order=3,
  tolerance_per_base=7e-5
):
    """
    Learn a background mosaic model from the sequences using Baum-Welch.

    @param sequences: The sequences to learn from; they are converted with the
        model's own converter before training.
    @param num_mosaics: Number of mosaics in the background model.
    @param order: Markov order of the background model.
    @param tolerance_per_base: Multiplied by the total length of the sequences
        to give the tolerance passed to Baum-Welch.
    @return: (bg_model, converted_sequences)
    """
    # honour the num_mosaics and order arguments (they used to be ignored in
    # favour of hard-coded values equal to the defaults)
    bg_model = hmm.as_model(
      create_mosaic_model(
        num_mosaics=num_mosaics,
        p_transition=.1,
        alphabet_size=4,
        order=order,
        dirichlet_prior_strength=.3
      )
    )
    converted_seqs = [bg_model.converter.to_order_n(s) for s in sequences]

    def _callback(LL):
        # always returns True after logging the current log likelihood
        logging.debug('Background model LL: %f', LL)
        return True

    tolerance = tolerance_per_base * sum(len(s) for s in sequences)
    logging.info('Learning background model with %d mosaics of order %d, tolerance=%f', num_mosaics, order, tolerance)
    start = time.time()
    LL, iterations = bg_model.baum_welch(converted_seqs, tolerance=tolerance, callback=_callback)
    logging.info('Achieved LL=%f after %d iterations and %f seconds', LL, iterations, time.time() - start)
    return bg_model, converted_seqs
    def _model_for_L_mer(self, L_mer, gap_index, p_binding_site):
        """
        Create a model initialised by this L-mer.

        @param L_mer: The L-mer to initialise the emissions from.
        @param gap_index: Passed on to self._make_builder and nucleo_dist_from_mer.
        @param p_binding_site: Passed on when the PSSM is added to the background model.
        @return: The PSSM combined with a simple background model, as a hmm model.
        """
        width = len(L_mer)

        # where the L-mer starts and the builder that will make the model
        start, builder = self._make_builder(gap_index, width)

        # emission distribution derived from the L-mer
        mer_dist = nucleo_dist_from_mer(
          seq_to_numpy(L_mer),
          self.options.pseudo_count_for_model_initialisation,
          gap_index=gap_index
        )

        # uniform everywhere except the rows the L-mer covers
        # (width + 1 rows are written - presumably one extra for the gap; confirm)
        emissions = numpy.ones((self.options.K, 4)) / 4.
        stop = start + width + 1
        emissions[start:stop] = mer_dist

        # build the PSSM and embed it in a background model
        pssm, in_states, out_states = builder.create(p_gap=.5, emissions=emissions)
        with_background = single_gap.add_to_simple_background_model(
          model=pssm,
          in_states=in_states,
          out_states=out_states,
          p_binding_site=p_binding_site
        )
        return hmm.as_model(with_background)
Exemple #6
0
 def create_test_data_generating_model(self, dirichlet_strength=.05):
     """
     Create a model to generate test data from.

     The initial distribution is spread evenly over the first
     self.num_background_mosaics states and is zero for every other state.

     @param dirichlet_strength: Passed to self.new_model as dirichlet_strength.
     @return: The normalised model.
     """
     # pass the caller's strength on (it used to be ignored in favour of a hard-coded .05)
     data_generating_model = hmm.as_model(self.new_model(dirichlet_strength=dirichlet_strength))
     # clear every initial probability...
     for i in xrange(data_generating_model.N):
         data_generating_model.set_initial(i, 0.0)
     # ...then share the mass between the first num_background_mosaics states
     for i in xrange(self.num_background_mosaics):
         data_generating_model.set_initial(i, 1.0 / self.num_background_mosaics)
     data_generating_model.normalise()
     return data_generating_model
def build_hmm_model(freqs, gaps, p_binding_site=.001):
    """
    @return: A hmm.Model representing the gapped PWM defined by the arguments.
    """
    # build the by-states description, convert it and normalise the result
    by_states = build_model_by_states(freqs, gaps, p_binding_site=p_binding_site)
    result = hmm.as_model(by_states)
    result.normalise()
    return result
Exemple #8
0
 def log_info(self, model, log):
     """
     Log information about the model.

     Logs whatever BaseTraits.log_info logs, then the adjusted information
     content per base and every gap probability above self.gap_threshold.

     @param model: The model to describe (converted with hmm.as_model).
     @param log: Object whose info() method receives the messages.
     """
     model = hmm.as_model(model)
     BaseTraits.log_info(self, model, log)
     log.info('Adjusted IC/base = %.4f' % self.information_content_per_base(model))
     # NOTE(review): k runs up to K-1 here; other code in this file only queries
     # gaps for k < K-1 - confirm the last position is meant to be included.
     for k in xrange(self.K):
         p_gap = self.p_gap_for_model(model, k)
         if p_gap > self.gap_threshold:
             log.info('p(gap at %d) = %.4f' % (k, p_gap))
    def test_traits(self):
        """
        Check models made by GappedPssmTraits for background orders 0, 1 and 2.

        Verifies the parameterisation of the reverse complement gap
        transitions, that emissions agree with those of the state/observation
        given by get_non_reverse_complement, and the Viterbi path for a
        sequence matching the emission distributions.
        """
        from hmm.pssm import create_background_model, seq_to_numpy
        from infpy import check_is_close_2

        def one_hot(base):
            "A distribution over the 4 bases with all of its mass on one base."
            dist = [0., 0., 0., 0.]
            dist[base] = 1.
            return dist

        # the base that each of the 13 emission distributions puts its mass on
        favoured_bases = [0, 1, 1, 0, 2, 3, 3, 3, 2, 1, 0, 1, 3]
        num_background_mosaics = 4

        for order in [0, 1, 2]:
            traits = hmm.pssm.GappedPssmTraits(
                K=7,
                p_binding_site=.1,
                background_order=order,
                num_background_mosaics=num_background_mosaics,
                background_model_creator=create_background_model
            )
            emission_dists = [one_hot(base) for base in favoured_bases]
            m = hmm.as_model(traits.new_model(emission_dists))

            def param_idx(from_state, to_state):
                "Index of the parameter behind the given transition."
                return m.get_transition_parameterisation(from_state, to_state).idx

            # check we have parameterised the reverse complement gaps correctly
            assert param_idx(29, 28) == param_idx(14, 15)
            assert param_idx(29, 28) != param_idx(4, 5)

            # check the reverse complement states are correct
            B = m.B
            for n in xrange(m.N):
                for o in xrange(m.M):
                    other_n, other_o = traits.get_non_reverse_complement(n, o)
                    assert check_is_close_2(B[other_n, other_o], B[n, o]), (
                        '%d,%d %d,%d: %f %f' % (
                            other_n, other_o, n, o, B[other_n, other_o], B[n, o]
                        )
                    )

            # check viterbi gives correct result
            test_seq = 'acgtgat' # matches dist above
            order_0_seq = seq_to_numpy(test_seq)
            order_n_seq = m.converter.to_order_n(order_0_seq)
            LL, states = m.viterbi(order_n_seq)
            for i, state in enumerate(states):
                assert (state-num_background_mosaics)/2 == i
    def test_traits(self):
        """
        Exercise GappedPssmTraits with background models of order 0, 1 and 2.

        For each order a model is built from fixed emission distributions and
        three things are asserted: which gap transitions share a parameter,
        that each emission equals the one at the state/observation returned by
        get_non_reverse_complement, and that Viterbi decodes a matching
        sequence to the expected states.
        """
        from hmm.pssm import create_background_model, seq_to_numpy
        from infpy import check_is_close_2

        A, C, G, T = (
            [1., 0., 0., 0.],
            [0., 1., 0., 0.],
            [0., 0., 1., 0.],
            [0., 0., 0., 1.],
        )

        for order in [0, 1, 2]:
            num_background_mosaics = 4
            traits = hmm.pssm.GappedPssmTraits(
                K=7,
                p_binding_site=.1,
                background_order=order,
                num_background_mosaics=num_background_mosaics,
                background_model_creator=create_background_model)
            # fresh copies so that no two rows share a list object
            emission_dists = [
                list(dist)
                for dist in (A, C, C, A, G, T, T, T, G, C, A, C, T)
            ]
            m = hmm.as_model(traits.new_model(emission_dists))

            # check we have parameterised the reverse complement gaps correctly
            parameterisation = m.get_transition_parameterisation
            assert parameterisation(29, 28).idx == parameterisation(14, 15).idx
            assert parameterisation(29, 28).idx != parameterisation(4, 5).idx

            # check the reverse complement states are correct
            emissions = m.B
            for n in xrange(m.N):
                for o in xrange(m.M):
                    partner = traits.get_non_reverse_complement(n, o)
                    partner_state, partner_obs = partner
                    assert check_is_close_2(
                        emissions[partner_state, partner_obs],
                        emissions[n, o]
                    ), ('%d,%d %d,%d: %f %f' %
                        (partner_state, partner_obs, n, o,
                         emissions[partner_state, partner_obs],
                         emissions[n, o]))

            # check viterbi gives correct result
            test_seq = 'acgtgat'  # matches dist above
            as_order_0 = hmm.pssm.seq_to_numpy(test_seq)
            as_order_n = m.converter.to_order_n(as_order_0)
            LL, states = m.viterbi(as_order_n)
            expected = 0
            for state in states:
                assert (state - num_background_mosaics) / 2 == expected
                expected += 1
def uniform_bg_model():
    """
    @return: A background model with a single mosaic of order 0 and no
        Dirichlet prior, converted with hmm.as_model.
    """
    settings = dict(
      num_mosaics=1,
      p_transition=.1,
      alphabet_size=4,
      order=0,
      dirichlet_prior_strength=None
    )
    return hmm.as_model(create_mosaic_model(**settings))
Exemple #12
0
 def create_test_data_generating_model(self, dirichlet_strength=.05):
     """
     Create a model to generate test data from.

     Every gap transition is set to 0.0 except the one at position K/2
     (rounded down), which is set to 0.5. The initial distribution is spread
     evenly over the first self.num_background_mosaics states and is zero for
     every other state.

     @param dirichlet_strength: Passed to self.new_model as dirichlet_strength.
     @return: The normalised model.
     """
     # pass the caller's strength on (it used to be ignored in favour of a hard-coded .05)
     data_generating_model = hmm.as_model(self.new_model(dirichlet_strength=dirichlet_strength))
     for k in xrange(self.K-1):
         data_generating_model.set_transition(self.kth(k), self.kth_gap(k), 0.0) # set all gaps to 0.0
     # explicit floor division: the midpoint is used as a position index, so it
     # must stay an integer whatever the division semantics in force
     midpoint = self.K // 2
     data_generating_model.set_transition(self.kth(midpoint), self.kth_gap(midpoint), 0.5) # except for gap at midpoint
     # clear every initial probability...
     for i in xrange(data_generating_model.N):
         data_generating_model.set_initial(i, 0.0)
     # ...then share the mass between the first num_background_mosaics states
     for i in xrange(self.num_background_mosaics):
         data_generating_model.set_initial(i, 1.0 / self.num_background_mosaics)
     data_generating_model.normalise()
     return data_generating_model
Exemple #13
0
 def information_content_per_base(self, model):
     """
     Information content of the PSSM divided by its expected number of bases.

     Each of the K positions counts as one base; the gap after each position
     but the last adds its gap probability to the expected number of bases and
     its information content weighted by that probability.

     @param model: The model to examine (converted with hmm.as_model).
     @return: Total information content / expected number of bases.
     """
     model = hmm.as_model(model)
     dists = self.pssm_dist(model)
     base_ic = hmm.pssm.base_information_content
     last = self.K - 1
     total_ic = 0.0
     num_bases = 0.0
     for k in xrange(self.K):
         # even rows hold the positions themselves
         total_ic += base_ic(dists[2*k])
         num_bases += 1.0
         if k >= last:
             continue
         # odd rows hold the gap that may follow position k
         gap_prob = self.p_gap_for_model(model, k)
         num_bases += gap_prob
         total_ic += gap_prob * base_ic(dists[2*k+1])
     return total_ic / num_bases
Exemple #14
0
def evaluate_mosaics(max_mosaics=6, max_order=3):
    """
    Evaluate different mosaic models on chip-chip fragments.

    Each combination of Markov order (0..max_order) and number of mosaics
    (1..max_mosaics) is scored by training a new mosaic model on every
    training set and adding up the log likelihoods of the corresponding test
    sequences.

    @return: A list of (order, num_mosaics, LL) tuples.
    """
    from gapped_pssms import data

    def _held_out_LL(order, num_mosaics, order_n_seqs):
        "Sum of test log likelihoods over all (training, test) pairs."
        LL = 0.
        for training_seqs, test_seqs in order_n_seqs:
            model = hmm.as_model(
              create_mosaic_model(
                num_mosaics=num_mosaics,
                p_transition=0.,
                alphabet_size=4,
                order=order,
                dirichlet_prior_strength=10.
              )
            )
            model.baum_welch(training_seqs)
            LL += sum(model.LL(s) for s in test_seqs)
        return LL

    sequences = data.training_test_sequences()
    mosaic_sizes = range(1,max_mosaics+1)
    orders = range(max_order+1)
    # NOTE(review): this list is built but not used afterwards - the order-n
    # conversion below reads the raw sequences. Confirm which is intended.
    preprocessed_sequences = []
    for training, test in sequences:
        preprocessed_sequences.append((
          [hmm.preprocess_sequence(s) for s in training],
          [hmm.preprocess_sequence(s) for s in test]
        ))

    scores = list()
    for order in orders:
        converter = hmm.MarkovOrderConverter(alphabet_size=4, order=order)
        order_n_seqs = []
        for training, test in sequences:
            order_n_seqs.append((
              [converter.to_order_n(s) for s in training],
              [converter.to_order_n(s) for s in test]
            ))
        for num_mosaics in mosaic_sizes:
            LL = _held_out_LL(order, num_mosaics, order_n_seqs)
            logging.info('Order: %d; # mosaics: %d; LL: %f', order, num_mosaics, LL)
            scores.append((order, num_mosaics, LL))
    return scores
Exemple #15
0
 def model_for_initialisation_K_mer(self, K_mer, p_binding_site):
     """
     Create a model initialised by this K-mer.

     Every position starts with self.initialisation_pseudo_count for each of
     the 4 bases. A base coded 0-3 adds 1.0 to its own entry; a base coded 4
     adds 0.25 to all four entries of its position.

     @param K_mer: Sequence of base codes.
     @param p_binding_site: Passed on when the PSSM is added to the background model.
     @return: The PSSM combined with a simple background model, as a hmm model.
     """
     counts = numpy.ones((self.K, 4)) * self.initialisation_pseudo_count
     for position, base in enumerate(K_mer):
         if 4 != base:
             counts[position, base] += 1.0
         else:
             # code 4: share the count equally between the four bases
             counts[position] += 0.25
     uniform_gap = numpy.ones((4,)) / 4.0
     pssm, in_states, out_states = self.builder.create(
         p_gap=0.5,
         non_gap_emissions=counts,
         gap_emissions=uniform_gap
     )
     with_background = single_gap.add_to_simple_background_model(
         model=pssm,
         in_states=in_states,
         out_states=out_states,
         p_binding_site=p_binding_site
     )
     return hmm.as_model(with_background)
Exemple #16
0
 def write_logo(self, model, f, rev_comp=False):
     """
     Write a logo of the model's PSSM as a PNG image.

     Every position gets a fully opaque column. The gap after a position (for
     all but the last position) gets a column only when its probability
     exceeds self.gap_threshold, with that probability as its transparency.

     @param model: The model to draw (converted with hmm.as_model).
     @param f: Passed to the image's save() method as the destination.
     @param rev_comp: If true, the column order and the order of the entries
         within each column are both reversed.
     @return: The image.
     """
     import hmm.pssm.logo as logo
     model = hmm.as_model(model)
     dists = self.pssm_dist(model)
     columns = []
     alphas = []
     last = self.K - 1
     for k in xrange(self.K):
         columns.append(dists[2*k])
         alphas.append(1.0)
         if k >= last:
             continue
         gap_prob = self.p_gap_for_model(model, k)
         if gap_prob > self.gap_threshold:
             columns.append(dists[2*k+1])
             alphas.append(gap_prob)
     if rev_comp:
         # reverse the columns and flip the base order inside each of them
         columns = [column[::-1] for column in reversed(columns)]
         alphas = alphas[::-1]
     image = logo.pssm_as_image(columns, transparencies=alphas)
     image.save(f, "PNG")
     return image
Exemple #17
0
    )
    emissions[builder.gap_index] = hmm.dirichlet_draw(numpy.ones(builder.M) * .3)
    model_by_states, in_states, out_states = builder.create(
      p_gap=.6,
      emissions=emissions
    )

    # create a background model and add the single gapped pssm to it
    complete_model = add_to_simple_background_model(
      model_by_states,
      in_states,
      out_states,
      p_binding_site=.01)

    # convert to other type of model
    model = hmm.as_model(complete_model)

    # write as a graph
    hmm.graph_as_svg(
      model,
      'single-gapped-hmm',
      graphing_keywords={'include_emissions':False},
      neato_properties={'-Elen':2}
    )

    # get the emissions and gap probabilities and write a logo
    emissions_copy, gap_probs = builder.get_emissions_and_gap_probabilities(model, offset=1)
    assert (emissions_copy - emissions).sum() < 1e-10
    import hmm.pssm.logo as logo
    image = logo.pssm_as_image(emissions, transparencies=gap_probs)
    image.save("single-gapped-pssm-logo.png", "PNG")
Exemple #18
0
def build_hmm_model(freqs, gaps, p_binding_site=.001):
    """
    @return: A hmm.Model representing the gapped PWM defined by the arguments.
    """
    # describe the model by its states, convert it, then normalise it in place
    hmm_model = hmm.as_model(
        build_model_by_states(freqs, gaps, p_binding_site=p_binding_site)
    )
    hmm_model.normalise()
    return hmm_model
Exemple #19
0
        e=me[i*max_mosaics:(i+1)*max_mosaics]
        plot([x[1] for x in e], [x[2] for x in e])
    xlabel('# mosaics')
    ylabel('LL')
    title('Evaluation of mosaic models of various Markov orders')
    savefig('mosaic-evaluation.png', format='PNG')
    raise

    # load our sequences
    sequences = convert_fasta_sequences(fasta_file_for_fragment('T00671'))

    # build our model
    model_by_states = create_mosaic_model(
      num_mosaics=1,
      p_transition=0.,
      alphabet_size=4,
      order=2,
      dirichlet_prior_strength=10.
    )
    model = hmm.as_model(model_by_states)
    print model.B

    # convert our sequences to the correct order
    sequences_order_n = [model.converter.to_order_n(s) for s in sequences]

    #from IPython.Debugger import Pdb; Pdb().set_trace()
    def callback(LL):
        logging.info('LL: %f', LL)
    model.baum_welch(sequences_order_n, callback=callback)
    print model.B
Exemple #20
0
        hmm.dirichlet_draw(numpy.ones(builder.M) * .1)
        for k in xrange(builder.K)
    ])
    emissions[builder.gap_index] = hmm.dirichlet_draw(
        numpy.ones(builder.M) * .3)
    model_by_states, in_states, out_states = builder.create(
        p_gap=.6, emissions=emissions)

    # create a background model and add the single gapped pssm to it
    complete_model = add_to_simple_background_model(model_by_states,
                                                    in_states,
                                                    out_states,
                                                    p_binding_site=.01)

    # convert to other type of model
    model = hmm.as_model(complete_model)

    # write as a graph
    hmm.graph_as_svg(model,
                     'single-gapped-hmm',
                     graphing_keywords={'include_emissions': False},
                     neato_properties={'-Elen': 2})

    # get the emissions and gap probabilities and write a logo
    emissions_copy, gap_probs = builder.get_emissions_and_gap_probabilities(
        model, offset=1)
    assert (emissions_copy - emissions).sum() < 1e-10
    import hmm.pssm.logo as logo
    image = logo.pssm_as_image(emissions, transparencies=gap_probs)
    image.save("single-gapped-pssm-logo.png", "PNG")
Exemple #21
0
    for i in range(max_order):
        e = me[i * max_mosaics:(i + 1) * max_mosaics]
        plot([x[1] for x in e], [x[2] for x in e])
    xlabel('# mosaics')
    ylabel('LL')
    title('Evaluation of mosaic models of various Markov orders')
    savefig('mosaic-evaluation.png', format='PNG')
    raise

    # load our sequences
    sequences = convert_fasta_sequences(fasta_file_for_fragment('T00671'))

    # build our model
    model_by_states = create_mosaic_model(num_mosaics=1,
                                          p_transition=0.,
                                          alphabet_size=4,
                                          order=2,
                                          dirichlet_prior_strength=10.)
    model = hmm.as_model(model_by_states)
    print model.B

    # convert our sequences to the correct order
    sequences_order_n = [model.converter.to_order_n(s) for s in sequences]

    #from IPython.Debugger import Pdb; Pdb().set_trace()
    def callback(LL):
        logging.info('LL: %f', LL)

    model.baum_welch(sequences_order_n, callback=callback)
    print model.B
Exemple #22
0
 def get_p_binding(self, model):
     """
     @return: Twice the transition probability from state 0 to state
         self.num_background_mosaics in the model.
     """
     transitions = hmm.as_model(model).A
     return 2.0 * transitions[0, self.num_background_mosaics]
Exemple #23
0
 def log_info(self, model, log):
     """
     Log the information content of the model's PSSM and its binding site
     probability.

     @param model: The model to describe (converted with hmm.as_model).
     @param log: Object whose info() method receives the messages.
     """
     model = hmm.as_model(model)
     # each value is computed immediately before its own message is logged
     ic = hmm.pssm.information_content(self.pssm_dist(model))
     log.info('IC: %.3f' % ic)
     p_binding_site = self.p_binding_site_for_model(model)
     log.info('p(binding site): %.6f' % p_binding_site)
Exemple #24
0
 def get_p_binding(self, model):
     """
     @return: Double the probability of the transition from state 0 to state
         self.num_background_mosaics.
     """
     as_model = hmm.as_model(model)
     p_transition = as_model.A[0, self.num_background_mosaics]
     return 2.0 * p_transition
Exemple #25
0
 def log_info(self, model, log):
     """
     Report the PSSM's information content and the binding site probability.

     @param model: The model to describe (converted with hmm.as_model).
     @param log: Object whose info() method receives the messages.
     """
     model = hmm.as_model(model)
     pssm = self.pssm_dist(model)
     log.info('IC: %.3f' % (hmm.pssm.information_content(pssm),))
     log.info('p(binding site): %.6f' % (self.p_binding_site_for_model(model),))
Exemple #26
0
    numpy.array([hmm.dirichlet_draw(numpy.ones(builder.M) * strength) for k in xrange(builder.K)])
    for strength in dirichlet_prior_strengths
]
gap_emissions = [hmm.dirichlet_draw(numpy.ones(builder.M) * strength) for strength in dirichlet_prior_strengths]


# create out single gapped pssms
pssms = [builder.create(p_gap, non_gap, gap) for non_gap, gap in zip(emissions, gap_emissions)]


# create our complete models (by adding a background model)
p_binding_site = exp_sites_per_sequence / L
models = [
    hmm.as_model(
        single_gap.add_to_simple_background_model(
            model=pssm[0], in_states=pssm[1], out_states=pssm[2], p_binding_site=p_binding_site
        )
    )
    for pssm in pssms
]


# write our logos
# convert to sequences and write fasta
def tag(sample_idx):
    "@return: A label made from the K, p_gap, N, L and seed settings followed by the sample index."
    fields = (K, p_gap, N, L, seed, sample_idx)
    return "K%d-g%.2f-N%d-L%d-seed%d-%d" % fields


print "Writing logos"
for i, model in enumerate(models):
    emissions, gap_probs = builder.get_emissions_and_gap_probabilities(model, offset=1)
Exemple #27
0
 def pssm_dist(self, model):
     """
     @return: The first 4 columns of the K rows of the model's B matrix that
         start at row self.num_background_mosaics.
     """
     emissions = hmm.as_model(model).B
     first = self.num_background_mosaics
     return emissions[first:first+self.K, :4]
Exemple #28
0
 def p_gap_for_model(self, model, k):
     """
     @return: The model's transition probability from state self.kth(k) to
         state self.kth_gap(k).
     """
     transitions = hmm.as_model(model).A
     from_state = self.kth(k)
     to_state = self.kth_gap(k)
     return transitions[from_state, to_state]