def test_MLEEstimator(self):
        """Query a dapos trigram model and an NLTK trigram model for one phrase.

        Builds a 3-gram ``NGram`` using ``MLEEstimator`` and a 3-gram NLTK
        ``NgramModel`` over ``self.corpus`` using ``MLEProbDist``, then asks
        each for the probability of the same three-word phrase.

        NOTE(review): the two probabilities are not compared in the code
        visible here -- confirm whether an assertion is meant to follow.
        """
        trigram_model = NGram(3, estimator=MLEEstimator())
        trigram_model.set_index(AuxiliaryIndex)

        reference_model = NgramModel(3, self.corpus, estimator=MLEProbDist)
        words = 'Stop being stunned'.split()
        # The dapos model takes the whole phrase; the NLTK model takes the
        # final word and its two-word history separately.
        dapos_prob = trigram_model.prob(words)
        history, target = words[:2], words[2]
        nltk_prob = reference_model.prob(target, history)
Exemple #2
0
class Inflector(object):
    """A simple inflector based on a lemma bigram model.

    Two models are trained and linearly interpolated at inflection time:

    * ``lr_model`` -- a left-to-right bigram model over ``"lemma~form"``
      tokens.
    * ``dp_model`` -- a ``DependencyNgramModel`` built from the lemma
      sentences, form sentences and dependency trees.
    """

    def __init__(self, training_prefix):
        """Train both models from the files under *training_prefix*.

        ``<training_prefix>.lemma``, ``<training_prefix>.form`` and
        ``<training_prefix>.tree`` are read in lockstep, one sentence per
        line.  Lemma and form lines are split on whitespace and paired
        token by token; each tree line is handed to ``DepTree``.
        """
        l_sentences = []
        f_sentences = []
        c_sentences = []
        trees = []
        # The set of possible inflections for each lemma.
        self.inflections = defaultdict(set)
        with utf8open(training_prefix + ".lemma") as lemma_file, utf8open(
            training_prefix + ".form"
        ) as form_file, utf8open(training_prefix + ".tree") as tree_file:
            for lemma_line, form_line, tree_line in izip(lemma_file, form_file, tree_file):
                l_sentence = lemma_line.split()
                f_sentence = form_line.split()
                # Combined "lemma~form" tokens feed the left-to-right model.
                c_sentence = []
                for lemma, form in izip(l_sentence, f_sentence):
                    c_sentence.append("{}~{}".format(lemma, form))
                    self.inflections[lemma].add(form)
                l_sentences.append(l_sentence)
                f_sentences.append(f_sentence)
                c_sentences.append(c_sentence)
                trees.append(DepTree(tree_line))
        self.lr_model = NgramModel(2, c_sentences, pad_left=True, estimator=lidstone_estimator)
        self.dp_model = DependencyNgramModel(2, l_sentences, f_sentences, trees)

    def inflect(self, testing_prefix, dp_weight=0.5):
        """Return a list containing inflected versions of the sentences
        described by the files under *testing_prefix*.

        ``<testing_prefix>.lemma`` and ``<testing_prefix>.tree`` are read in
        lockstep, one sentence per line.  *dp_weight* is the interpolation
        weight given to the dependency model; the left-to-right model
        receives ``1 - dp_weight``.  Each returned item is one sentence with
        its chosen forms joined by single spaces.
        """
        lr_weight = 1 - dp_weight
        inflected = []
        with utf8open(testing_prefix + ".lemma") as lemma_file, utf8open(testing_prefix + ".tree") as tree_file:
            for lemma_line, tree_line in izip(lemma_file, tree_file):
                l_sentence = lemma_line.split()
                tree = DepTree(tree_line)
                # The lemma sentence is passed twice (NOTE(review): presumably
                # standing in for the not-yet-known forms); only the leading
                # ``dep_ngram[:-1]`` part of each n-gram is consumed when
                # scoring.
                ngrams = dep_ngrams(2, l_sentence, l_sentence, tree)
                forms = self._inflect_sentence(izip(l_sentence, ngrams), lr_weight, dp_weight)
                inflected.append(" ".join(forms))
        return inflected

    def _inflect_sentence(self, lemma_ngram_pairs, lr_weight, dp_weight):
        """Pick the best-scoring form for every lemma of a single sentence.

        *lemma_ngram_pairs* yields ``(lemma, dep_ngram)`` pairs in sentence
        order.  Returns the list of chosen forms; a lemma with no known
        inflections is emitted unchanged.
        """
        forms = []
        # Context for the left-to-right model: the previously emitted
        # "lemma~form" token, or the empty padding token at sentence start.
        context = [""]
        for lemma, dep_ngram in lemma_ngram_pairs:
            # ``.get`` rather than indexing: indexing the defaultdict would
            # insert an empty set for every unseen lemma and so mutate the
            # trained table during inference.
            candidates = self.inflections.get(lemma)
            if not candidates:
                # We've never seen this lemma before, so just output it
                # as-is.
                best_form = lemma
            else:
                dep_context = dep_ngram[:-1]
                best_form = None
                best_score = float("-inf")
                for form in candidates:
                    score = lr_weight * self.lr_model.prob(
                        "{}~{}".format(lemma, form), context
                    ) + dp_weight * self.dp_model.prob(form, dep_context)
                    if score > best_score:
                        best_form = form
                        best_score = score
            forms.append(best_form)
            # Always advance the context -- also after an unseen lemma -- so
            # the next word is conditioned on the token actually emitted
            # just before it.
            context = ["{}~{}".format(lemma, best_form)]
        return forms