def test_MLEEstimator(self):
    """Check that the DAPOS MLE trigram model agrees with NLTK's.

    Both models score the third word of the phrase given the first
    two words as context; the probabilities should coincide.
    """
    est = MLEEstimator()
    dapos_model = NGram(3, estimator=est)
    dapos_model.set_index(AuxiliaryIndex)
    nltk_model = NgramModel(3, self.corpus, estimator=MLEProbDist)
    phrase = 'Stop being stunned'.split()
    x = dapos_model.prob(phrase)
    y = nltk_model.prob(phrase[2], phrase[:2])
    # Bug fix: the original computed both probabilities but never
    # compared them, so the test could not fail. Use an approximate
    # comparison since the values are floats.
    self.assertAlmostEqual(x, y)
class Inflector(object):
    """A simple inflector based on a lemma bigram model.

    Trains on three parallel files -- ``<prefix>.lemma``,
    ``<prefix>.form`` and ``<prefix>.tree`` -- building a linear
    left-to-right bigram model over combined ``lemma~form`` tokens
    and a dependency-tree bigram model over (lemma, form) pairs.
    """

    def __init__(self, training_prefix):
        l_sentences = []
        f_sentences = []
        c_sentences = []
        trees = []
        # The set of possible inflections for each lemma.
        self.inflections = defaultdict(set)
        with utf8open(training_prefix + ".lemma") as lemma_file, utf8open(
            training_prefix + ".form"
        ) as form_file, utf8open(training_prefix + ".tree") as tree_file:
            for lemma_line, form_line, tree_line in izip(
                lemma_file, form_file, tree_file
            ):
                l_sentence = lemma_line.split()
                f_sentence = form_line.split()
                c_sentence = []
                for lemma, form in izip(l_sentence, f_sentence):
                    c_sentence.append("{}~{}".format(lemma, form))
                    self.inflections[lemma].add(form)
                l_sentences.append(l_sentence)
                f_sentences.append(f_sentence)
                c_sentences.append(c_sentence)
                trees.append(DepTree(tree_line))
        # Linear (surface-order) model over combined lemma~form tokens.
        self.lr_model = NgramModel(
            2, c_sentences, pad_left=True, estimator=lidstone_estimator
        )
        # Dependency-structure model over (lemma, form) pairs.
        self.dp_model = DependencyNgramModel(2, l_sentences, f_sentences, trees)

    def inflect(self, testing_prefix, dp_weight=0.5):
        """Return a list containing inflected versions of the sentences
        described by the files under *testing_prefix*.

        :param testing_prefix: path prefix of the ``.lemma`` and
            ``.tree`` input files.
        :param dp_weight: weight of the dependency model's score; the
            linear model receives ``1 - dp_weight``.
        """
        lr_weight = 1 - dp_weight
        inflected = []
        with utf8open(testing_prefix + ".lemma") as lemma_file, utf8open(
            testing_prefix + ".tree"
        ) as tree_file:
            for lemma_line, tree_line in izip(lemma_file, tree_file):
                l_sentence = lemma_line.split()
                tree = DepTree(tree_line)
                # Dependency bigrams aligned one-to-one with the lemmas;
                # each dep_ngram's prefix is the dp_model context.
                # (The original comment claimed this was unused, but it
                # feeds dp_model.prob below.)
                ngrams = dep_ngrams(2, l_sentence, l_sentence, tree)
                forms = []
                last_lemma = None
                for lemma, dep_ngram in izip(l_sentence, ngrams):
                    # Use .get so the membership test does not grow the
                    # defaultdict with empty sets for unseen lemmas.
                    if not self.inflections.get(lemma):
                        # We've never seen this lemma before, so just
                        # output it as-is and move on.
                        forms.append(lemma)
                        # Bug fix: record this token as the predecessor so
                        # the next token's bigram context pairs the right
                        # lemma with the right form (previously the stale
                        # last_lemma was combined with forms[-1]).
                        last_lemma = lemma
                        continue
                    # The context depends only on the previous token, so
                    # hoist it out of the candidate-form loop.
                    if last_lemma is None:
                        # Sentence-initial: the model was trained with
                        # pad_left, so the context is the empty token.
                        context = [""]
                    else:
                        context = ["{}~{}".format(last_lemma, forms[-1])]
                    best_form = None
                    best_score = float("-inf")
                    for form in self.inflections[lemma]:
                        # Linear interpolation of the two models' scores.
                        score = lr_weight * self.lr_model.prob(
                            "{}~{}".format(lemma, form), context
                        ) + dp_weight * self.dp_model.prob(form, dep_ngram[:-1])
                        if score > best_score:
                            best_form = form
                            best_score = score
                    forms.append(best_form)
                    last_lemma = lemma
                inflected.append(" ".join(forms))
        return inflected