class PoissonUnigramPattern:
    """Base distribution over patterns: unigram morphemes times a length model.

    A pattern id is looked up in ``pattern_vocabulary`` to obtain its sequence
    of morpheme ids.  Each morpheme is scored by a ``DirichletMultinomial``
    and the sequence length (minus one) by a ``GammaPoisson``.

    Morpheme ids are shifted down by 2 before they reach the morpheme model,
    because that model is built with ``K - 2`` outcomes.
    """

    def __init__(self, K, morpheme_prior, gamma, delta, pattern_vocabulary):
        """Build the morpheme and length models.

        K -- total number of morpheme ids; the morpheme model gets K - 2
            outcomes.
        morpheme_prior -- prior passed to the morpheme DirichletMultinomial.
        gamma, delta -- parameters passed to the GammaPoisson length model.
        pattern_vocabulary -- indexable mapping from a pattern id to its
            sequence of morpheme ids.
        """
        # K - 2 outcomes: per the original note, START and STOP are excluded
        # (presumably they occupy ids 0 and 1 -- confirm against the
        # morpheme vocabulary).
        self.morpheme_model = DirichletMultinomial(K - 2, morpheme_prior)
        self.length_model = GammaPoisson(gamma, delta)
        self.vocabulary = pattern_vocabulary

    def increment(self, pattern):
        """Add one observation of ``pattern`` to both sub-models."""
        morphemes = self.vocabulary[pattern]
        for morpheme in morphemes:
            self.morpheme_model.increment(morpheme - 2)
        # The length model counts len - 1, not len.
        self.length_model.increment(len(morphemes) - 1)

    def decrement(self, pattern):
        """Remove one observation of ``pattern`` from both sub-models."""
        morphemes = self.vocabulary[pattern]
        for morpheme in morphemes:
            self.morpheme_model.decrement(morpheme - 2)
        self.length_model.decrement(len(morphemes) - 1)

    def prob(self, pattern):
        """Return the probability of ``pattern``.

        This is the product of the morpheme probabilities multiplied by the
        length model's probability of ``len(morphemes) - 1``.
        """
        morphemes = self.vocabulary[pattern]
        # Running product; an empty pattern contributes a factor of 1.
        morpheme_prob = 1
        for morpheme in morphemes:
            # Use the same "- 2" shift as increment()/decrement() so that we
            # query the outcome that was actually counted.
            morpheme_prob *= self.morpheme_model.prob(morpheme - 2)
        return morpheme_prob * self.length_model.prob(len(morphemes) - 1)

    def log_likelihood(self, full=False):
        """Return the sum of both sub-models' log-likelihoods.

        ``full`` is forwarded unchanged to each sub-model.
        """
        return (self.morpheme_model.log_likelihood(full)
                + self.length_model.log_likelihood(full))

    def resample_hyperparemeters(self, n_iter):
        """Resample the morpheme model's hyperparameters and return its result.

        Only the morpheme model is resampled here; the length model is left
        untouched.  The method name's spelling is part of the public
        interface and is kept as is.
        """
        return self.morpheme_model.resample_hyperparemeters(n_iter)

    def __repr__(self):
        return ('PoissonUnigram(length ~ {self.length_model},'
                ' morph ~ {self.morpheme_model})').format(self=self)
def sample_topics(doc, model, n_iter):
    """Sample per-word topic assignments for ``doc`` and summarize them.

    Runs ``n_iter`` sweeps over the document.  In every sweep each word's
    topic is redrawn via ``mult_sample`` from (topic, weight) pairs, where the
    weight is the document-level topic probability times the probability of
    the word under that topic (``model.topic_word[k]``).  From the second
    sweep on, the word's previous assignment is removed from the document
    counts before it is redrawn.

    doc -- sequence of words (whatever ``model.topic_word[k].prob`` accepts).
    model -- object exposing ``n_topics``, ``alpha`` and ``topic_word``.
    n_iter -- number of sweeps over the document.

    Returns ``topic_vector(doc_topic, model)`` for the final document-topic
    counts.
    """
    n_topics = model.n_topics
    assignments = [None] * len(doc)
    topic_counts = DirichletMultinomial(n_topics, model.alpha)

    for sweep in xrange(n_iter):
        # Nothing has been assigned yet during the very first sweep, so there
        # is no previous assignment to take back out of the counts.
        has_previous = sweep > 0
        for position, word in enumerate(doc):
            if has_previous:
                topic_counts.decrement(assignments[position])

            # Lazily generated (topic, weight) pairs; presumably mult_sample
            # draws a topic in proportion to its weight -- confirm in helper.
            weighted_topics = (
                (k, (topic_counts.prob(k) * model.topic_word[k].prob(word)))
                for k in xrange(n_topics)
            )
            chosen = mult_sample(weighted_topics)

            assignments[position] = chosen
            topic_counts.increment(chosen)

    return topic_vector(topic_counts, model)