Esempio n. 1
0
    def test_language_models_on_brown_corpus(self):
        """Check corpus perplexity of several n-gram models on the Brown corpus.

        Every model is built from, and evaluated on, the same ``BrownCorpus``
        instance. The perplexity returned by ``self.calculator`` is rounded
        to two decimals and compared with a hard-coded expected value.
        """
        brown = BrownCorpus()

        # (expected perplexity, model) pairs; models are constructed up front
        # and checked in the order listed.
        cases = [
            (1548.66, UnigramLanguageModel(brown)),
            (100.10, BigramLanguageModel(brown)),
            (4659.28, NgramLanguageModel(
                brown, 2, LaplaceSmoothingNgramProbabilityCalculator)),
            (7.77, NgramLanguageModel(
                brown, 3, BackoffNgramProbabilityCalculator)),
            (17.56, NgramLanguageModel(
                brown, 3, InterpolatingNgramProbabilityCalculator)),
            (2.40, NgramLanguageModel(
                brown, 4, BackoffNgramProbabilityCalculator)),
        ]

        for expected, model in cases:
            measured = self.calculator.calculate_corpus_perplexity(
                model, brown)
            assert expected == round(measured, 2)
Esempio n. 2
0
    def test_language_models_on_reuters_corpus(self):
        """Check held-out perplexity of n-gram models on the Reuters corpus.

        The Reuters training sentences are split 90/10: the first part builds
        the models, the remainder is the evaluation corpus. Each perplexity
        is rounded to two decimals and compared with a hard-coded value.
        """
        sentences = ReutersTrainingCorpus().get_sentences()
        cut = round(len(sentences) * .9)
        train_corpus = Corpus(sentences[:cut])
        heldout_corpus = Corpus(sentences[cut:])

        # (expected perplexity, model) pairs; models are constructed up front
        # and checked in the order listed.
        cases = [
            (1021.63, UnigramLanguageModel(train_corpus)),
            (176.52, NgramLanguageModel(
                train_corpus, 2, BackoffNgramProbabilityCalculator)),
            (171.96, NgramLanguageModel(
                train_corpus, 3, BackoffNgramProbabilityCalculator)),
            (189.32, NgramLanguageModel(
                train_corpus, 3, InterpolatingNgramProbabilityCalculator)),
            (298.98, NgramLanguageModel(
                train_corpus, 4, BackoffNgramProbabilityCalculator)),
        ]

        for expected, model in cases:
            measured = self.calculator.calculate_corpus_perplexity(
                model, heldout_corpus)
            assert expected == round(measured, 2)
Esempio n. 3
0
    def test_language_models_on_somewhat_simple_corpus(self):
        """Check perplexities of n-gram models on a tiny hand-written corpus.

        Models are built from three short training sentences and evaluated on
        two test sentences. Each perplexity is rounded to two decimals and
        compared with a hard-coded expected value.
        """
        train_corpus = Corpus(
            [['a', 'b', 'c'], ['b', 'c', 'd'], ['c', 'd', 'e']])
        eval_corpus = Corpus([['a', 'b', 'c'], ['a', 'c', 'd']])

        # (expected perplexity, model) pairs; models are constructed up front
        # and checked in the order listed.
        cases = [
            (4.95, UnigramLanguageModel(train_corpus)),
            (1.68, BigramLanguageModel(train_corpus)),
            (3.49, NgramLanguageModel(
                train_corpus, 2, LaplaceSmoothingNgramProbabilityCalculator)),
            (1.73, NgramLanguageModel(
                train_corpus, 3, NgramProbabilityCalculator)),
            (2.93, NgramLanguageModel(
                train_corpus, 3, BackoffNgramProbabilityCalculator)),
            (3.04, NgramLanguageModel(
                train_corpus, 3, InterpolatingNgramProbabilityCalculator)),
        ]

        for expected, model in cases:
            measured = self.calculator.calculate_corpus_perplexity(
                model, eval_corpus)
            assert expected == round(measured, 2)
Esempio n. 4
0
    def find_lambdas_max_estimation_multiple_slicings(self, corpus, n, slicings):
        """Pick interpolation lambdas by cross-validating over several slicings.

        The corpus sentences are cut into ``slicings`` consecutive slices.
        Each slice serves once as the holdout set, with the remaining
        sentences as training data, and
        ``InterpolationTrainer.find_lambdas_max_estimation`` proposes one
        lambda set per slice. Every proposal is then scored on every
        training/holdout split, and the proposal with the lowest summed
        holdout perplexity is returned.

        Args:
            corpus: Object exposing ``get_sentences()``; the result is sliced
                and concatenated like a list.
            n: N-gram order forwarded to the trainer, the n-gram container
                and the language model.
            slicings: Number of slices (folds) to cut the corpus into.

        Returns:
            The per-slice lambda guess whose accumulated perplexity over all
            splits is smallest (the first one on ties).

        Raises:
            ZeroDivisionError: If ``slicings`` is 0.
            ValueError: If ``slicings`` is negative, because there are then
                no guesses to choose from.
        """
        sentences = corpus.get_sentences()
        slice_size = round(len(sentences) * (1/slicings))

        def split(fold):
            # The fold-th slice is held out; everything around it is training.
            start = round(slice_size * fold)
            end = start + slice_size
            training = Corpus(sentences[:start] + sentences[end:])
            holdout = Corpus(sentences[start:end])
            return training, holdout

        # Phase 1: one lambda proposal per fold.
        guesses = []
        for fold in range(slicings):
            training, holdout = split(fold)
            guesses.append(
                InterpolationTrainer().find_lambdas_max_estimation(training, holdout, n))

        # Phase 2: score every proposal on every fold. The split must follow
        # the current fold index, and each proposal must be scored with its
        # own lambdas, otherwise all accumulated perplexities are identical.
        perplexities = [0] * slicings
        perplexity_calculator = PerplexityCalculator()
        for fold in range(slicings):
            training, holdout = split(fold)
            # The n-gram container depends only on the training split, so it
            # is built once per fold and shared by all candidate lambdas.
            container = NgramCalculatorContainer(training, n)
            for guess_index, lambdas in enumerate(guesses):
                calculator = InterpolatingNgramProbabilityCalculator(container, lambdas)
                language_model = NgramLanguageModel(
                    training, n, ngram_probability_calculator=calculator)
                perplexities[guess_index] += perplexity_calculator.calculate_corpus_perplexity(
                    language_model, holdout)

        return guesses[perplexities.index(min(perplexities))]
Esempio n. 5
0
def q6_minimize_perplexity_of_reuters_corpus():
    """Print the Reuters test perplexity of a fixed interpolated trigram model.

    A trigram ``NgramLanguageModel`` is built from ``ReutersTrainingCorpus``
    using an ``InterpolatingNgramProbabilityCalculator`` with hard-coded
    lambdas, then evaluated on ``ReutersTestCorpus``. The result is printed
    to stdout; nothing is returned.
    """
    train_corpus = ReutersTrainingCorpus()
    test_corpus = ReutersTestCorpus()

    interpolation_weights = [0.2518, 0.4691, 0.2791]
    ngram_container = NgramCalculatorContainer(train_corpus, 3)
    interpolator = InterpolatingNgramProbabilityCalculator(
        ngram_container, interpolation_weights)
    model = NgramLanguageModel(
        train_corpus, 3, ngram_probability_calculator=interpolator)

    scorer = PerplexityCalculator()
    result = scorer.calculate_corpus_perplexity(model, test_corpus)

    print("Test perplexity best model: ", result)
Esempio n. 6
0
    def test_language_models_on_very_simple_corpus(self):
        """Check perplexities on a single ten-token sentence of unique symbols.

        The same one-sentence corpus is used to build and to evaluate the
        unigram, bigram and trigram models. Each perplexity is rounded to ten
        decimals and compared with a hard-coded expected value.
        """
        tiny_corpus = Corpus([list('abcdefghij')])

        # (expected perplexity, model) pairs; models are constructed up front
        # and checked in the order listed.
        cases = [
            (10, UnigramLanguageModel(tiny_corpus)),
            (1, BigramLanguageModel(tiny_corpus)),
            (1, NgramLanguageModel(
                tiny_corpus, 3, NgramProbabilityCalculator)),
        ]

        for expected, model in cases:
            measured = self.calculator.calculate_corpus_perplexity(
                model, tiny_corpus)
            assert expected == round(measured, 10)
Esempio n. 7
0
def q7_perplexity_oov_of_brown_corpus():
    """Print Brown-corpus perplexity and OOV rate for a Reuters-trained model.

    A trigram ``NgramLanguageModel`` is built from ``ReutersTrainingCorpus``
    using an ``InterpolatingNgramProbabilityCalculator`` with hard-coded
    lambdas and evaluated on ``BrownCorpus``. The out-of-vocabulary rate of
    the Brown corpus relative to the training corpus is computed as well.
    Both figures are printed to stdout; nothing is returned.
    """
    train_corpus = ReutersTrainingCorpus()
    brown_corpus = BrownCorpus()

    interpolation_weights = [0.2518, 0.4691, 0.2791]
    ngram_container = NgramCalculatorContainer(train_corpus, 3)
    interpolator = InterpolatingNgramProbabilityCalculator(
        ngram_container, interpolation_weights)
    model = NgramLanguageModel(
        train_corpus, 3, ngram_probability_calculator=interpolator)

    scorer = PerplexityCalculator()
    brown_perplexity = scorer.calculate_corpus_perplexity(model, brown_corpus)

    brown_oov_rate = OutOfVocabularyRateCalculator(
    ).calculate_out_of_vocabulary_rate(train_corpus, brown_corpus)

    print("Brown perplexity best model: ", brown_perplexity)
    print("Brown out of vocabulary rate: ", brown_oov_rate)
    def test_trainer_on_reuters_corpus(self):
        """Cross-validate interpolation lambdas on the Reuters training corpus.

        The sentences are cut into ten slices. Each slice is held out once
        while ``InterpolationTrainer.find_lambdas_max_estimation`` proposes a
        lambda set from the remaining sentences. Every proposal is then
        scored on every training/holdout split and the perplexities are
        summed per proposal.

        Prints, for each proposal, its average perplexity and its lambdas,
        followed by the smallest summed perplexity and the matching lambdas.
        The method makes no assertions.
        """
        folds = 10
        reuters = ReutersTrainingCorpus().get_sentences()
        slice_size = round(len(reuters) * .1)

        def split(fold):
            # The fold-th slice is held out; everything around it is training.
            start = round(slice_size * fold)
            end = start + slice_size
            training = Corpus(reuters[:start] + reuters[end:])
            holdout = Corpus(reuters[start:end])
            return training, holdout

        # Phase 1: one lambda proposal per fold.
        guesses = []
        for fold in range(folds):
            training, holdout = split(fold)
            guesses.append(
                InterpolationTrainer().find_lambdas_max_estimation(
                    training, holdout, 3))

        # Phase 2: score every proposal on every fold. The split must follow
        # the current fold index so that each round uses a different holdout.
        perplexities = [0] * folds
        perplexity_calculator = PerplexityCalculator()
        for fold in range(folds):
            training, holdout = split(fold)
            # The n-gram container depends only on the training split, so it
            # is built once per fold and shared by all candidate lambdas.
            container = NgramCalculatorContainer(training, 3)
            for guess_index, lambdas in enumerate(guesses):
                calculator = InterpolatingNgramProbabilityCalculator(
                    container, lambdas)
                max_estimation_language_model = NgramLanguageModel(
                    training, 3, ngram_probability_calculator=calculator)
                perplexities[guess_index] += (
                    perplexity_calculator.calculate_corpus_perplexity(
                        max_estimation_language_model, holdout))

        min_perplexity = min(perplexities)
        best_lambdas = guesses[perplexities.index(min_perplexity)]

        # Per-proposal lines show the average over the folds; the final line
        # shows the summed perplexity of the winner.
        for total, lambdas in zip(perplexities, guesses):
            print(total / 10, lambdas)

        print(min_perplexity, best_lambdas)
Esempio n. 9
0
    def find_lambdas_brute_force(self, training_corpus, holdout_corpus, n):
        """Grid-search three interpolation lambdas for the lowest perplexity.

        Candidate triples ``[l1, l2, l3]`` are enumerated in steps of 0.1
        with ``l3 = 1 - l1 - l2``. The inner range stops one step short of
        the remaining budget, so ``l3`` is always at least one step and
        ``l1`` never reaches 1.0. Each candidate is used to build an
        interpolating ``NgramLanguageModel`` on ``training_corpus``, which is
        scored on ``holdout_corpus``.

        Args:
            training_corpus: Corpus used for the shared n-gram container and
                for every candidate language model.
            holdout_corpus: Corpus on which perplexity is measured.
            n: N-gram order passed to the container and the language model.

        Returns:
            A ``(best_lambdas, minimum_perplexity)`` tuple. If no candidate
            scores below ``sys.maxsize`` the result is ``([], sys.maxsize)``.
        """
        # One container is shared by every candidate calculator.
        shared_counts = NgramCalculatorContainer(training_corpus, n)
        scorer = PerplexityCalculator()

        step = .1
        steps = round(1 / step)

        lowest = sys.maxsize
        winner = []

        for first_step in range(steps + 1):
            first = round(step * first_step, 2)
            for second_step in range(steps - first_step):
                second = round(step * second_step, 2)
                # The third weight takes whatever is left of the unit budget.
                candidate = [first, second, round(1 - first - second, 2)]
                interpolator = InterpolatingNgramProbabilityCalculator(
                    shared_counts, candidate)
                model = NgramLanguageModel(
                    training_corpus, n,
                    ngram_probability_calculator=interpolator)
                score = scorer.calculate_corpus_perplexity(
                    model, holdout_corpus)
                # Strict comparison keeps the first candidate on ties.
                if score < lowest:
                    lowest, winner = score, candidate

        return winner, lowest