class UniformUnigramPattern:
    """Unigram model over morphological patterns.

    A pattern of m morphemes is scored as a Gamma-Poisson draw for its
    length (m - 1 non-stem morphemes) times a uniform choice for each of
    those morphemes.
    """

    def __init__(self, K, gamma, delta, pattern_vocabulary):
        # Three symbols (START, STOP, STEM) are excluded from the
        # uniform morpheme inventory.
        self.morpheme_model = Uniform(K-3)
        self.length_model = GammaPoisson(gamma, delta)
        self.vocabulary = pattern_vocabulary

    def increment(self, pattern):
        """Record one observation of `pattern` in both sub-models."""
        size = len(self.vocabulary[pattern])
        self.morpheme_model.count += size - 1
        self.length_model.increment(size - 1)

    def decrement(self, pattern):
        """Remove one observation of `pattern` from both sub-models."""
        size = len(self.vocabulary[pattern])
        self.morpheme_model.count -= size - 1
        self.length_model.decrement(size - 1)

    def prob(self, pattern):
        """Probability of `pattern`: P(length) * P(uniform)^(m-1)."""
        size = len(self.vocabulary[pattern])
        per_morpheme = 1./self.morpheme_model.K
        return self.length_model.prob(size - 1) * per_morpheme ** (size - 1)

    def log_likelihood(self, full=False):
        """Sum of the two sub-models' log-likelihoods."""
        morpheme_ll = self.morpheme_model.log_likelihood(full)
        length_ll = self.length_model.log_likelihood(full)
        return morpheme_ll + length_ll

    def resample_hyperparemeters(self, n_iter):
        # NOTE: the spelling "hyperparemeters" matches the delegated
        # sub-model method and is kept for interface compatibility.
        return self.morpheme_model.resample_hyperparemeters(n_iter)

    def __repr__(self):
        return ('UniformUnigram(length ~ {self.length_model},'
                ' morph ~ {self.morpheme_model})').format(self=self)
def main():
    """Train a PYP language model of fixed order on a hard-coded corpus.

    Reads the training file into a Vocabulary, builds a PYPLM over a
    uniform base distribution, and runs the Gibbs sampler for a fixed
    number of iterations.
    """
    train_path = "../data/Verne.80jours.en"
    # Alternative corpora used in earlier experiments:
    # train_path = "../data/simplewiki-20140903-pages-articles.200000first.100000last.txt"
    # train_path = "../data/wsj.words"
    order = 3
    n_iter = 100
    vocabulary = Vocabulary()
    logging.info('Reading training corpus')
    # Distinct name for the file handle — the original shadowed the path
    # string `train` with the open file object.
    with open(train_path) as train_file:
        training_corpus = read_corpus(train_file, vocabulary)
    base = Uniform(len(vocabulary))
    model = PYPLM(order, base)
    logging.info('Training model of order %d', order)
    run_sampler(model, training_corpus, n_iter)
def __init__(self, K, gamma, delta, pattern_vocabulary): self.morpheme_model = Uniform(K-3) # -START, -STOP, -STEM self.length_model = GammaPoisson(gamma, delta) self.vocabulary = pattern_vocabulary