def test_language_models_on_brown_corpus(self):
    """Pin the corpus perplexities of several n-gram models that are both
    trained and evaluated on the full Brown corpus.

    Expected values are regression anchors (rounded to 2 decimals) captured
    from a known-good run.
    """
    corpus = BrownCorpus()
    # (language model, expected perplexity) — models built up-front, in the
    # same order the original assertions evaluated them.
    expectations = [
        (UnigramLanguageModel(corpus), 1548.66),
        (BigramLanguageModel(corpus), 100.10),
        (NgramLanguageModel(
            corpus, 2, LaplaceSmoothingNgramProbabilityCalculator), 4659.28),
        (NgramLanguageModel(
            corpus, 3, BackoffNgramProbabilityCalculator), 7.77),
        (NgramLanguageModel(
            corpus, 3, InterpolatingNgramProbabilityCalculator), 17.56),
        (NgramLanguageModel(
            corpus, 4, BackoffNgramProbabilityCalculator), 2.40),
    ]
    for model, expected in expectations:
        perplexity = self.calculator.calculate_corpus_perplexity(model, corpus)
        assert expected == round(perplexity, 2)
def test_language_models_on_reuters_corpus(self):
    """Train on the first 90% of the Reuters training sentences and pin the
    perplexity of several n-gram models on the held-out last 10%."""
    reuters = ReutersTrainingCorpus().get_sentences()
    split = round(len(reuters) * .9)
    training = Corpus(reuters[:split])
    test = Corpus(reuters[split:])
    # (language model, expected test perplexity rounded to 2 decimals)
    cases = (
        (UnigramLanguageModel(training), 1021.63),
        (NgramLanguageModel(
            training, 2, BackoffNgramProbabilityCalculator), 176.52),
        (NgramLanguageModel(
            training, 3, BackoffNgramProbabilityCalculator), 171.96),
        (NgramLanguageModel(
            training, 3, InterpolatingNgramProbabilityCalculator), 189.32),
        (NgramLanguageModel(
            training, 4, BackoffNgramProbabilityCalculator), 298.98),
    )
    for model, expected in cases:
        assert expected == round(
            self.calculator.calculate_corpus_perplexity(model, test), 2)
def test_language_models_on_somewhat_simple_corpus(self):
    """Pin model perplexities on a tiny hand-built training/test corpus,
    covering plain, smoothed, backoff, and interpolating calculators."""
    training = Corpus([['a', 'b', 'c'], ['b', 'c', 'd'], ['c', 'd', 'e']])
    test = Corpus([['a', 'b', 'c'], ['a', 'c', 'd']])
    # (language model, expected test perplexity rounded to 2 decimals)
    cases = [
        (UnigramLanguageModel(training), 4.95),
        (BigramLanguageModel(training), 1.68),
        (NgramLanguageModel(
            training, 2, LaplaceSmoothingNgramProbabilityCalculator), 3.49),
        (NgramLanguageModel(
            training, 3, NgramProbabilityCalculator), 1.73),
        (NgramLanguageModel(
            training, 3, BackoffNgramProbabilityCalculator), 2.93),
        (NgramLanguageModel(
            training, 3, InterpolatingNgramProbabilityCalculator), 3.04),
    ]
    for model, expected in cases:
        assert expected == round(
            self.calculator.calculate_corpus_perplexity(model, test), 2)
def find_lambdas_max_estimation_multiple_slicings(self, corpus, n, slicings):
    """Cross-validate interpolation lambdas over ``slicings`` folds.

    Phase 1: for each fold, hold out one contiguous slice of sentences and
    estimate a lambda vector on the remainder via maximum estimation.
    Phase 2: score every lambda guess on every fold's holdout set and sum
    its perplexities.

    :param corpus: corpus providing ``get_sentences()``
    :param n: n-gram order for the interpolated models
             (NOTE: previously ignored — the order was hard-coded to 3)
    :param slicings: number of cross-validation folds
    :return: the lambda guess with the lowest total holdout perplexity
    """
    sentences = corpus.get_sentences()
    slice_size = round(len(sentences) * (1 / slicings))
    guesses = []
    for fold in range(slicings):
        start = round(slice_size * fold)
        end = start + slice_size
        training = Corpus(sentences[:start] + sentences[end:])
        holdout = Corpus(sentences[start:end])
        guesses.append(InterpolationTrainer().find_lambdas_max_estimation(
            training, holdout, n))
    perplexities = [0] * slicings
    for fold in range(slicings):
        # BUGFIX: the original computed ``round(slice_size * i)`` here with
        # the stale loop variable from phase 1, so every fold re-scored the
        # same (last) slice instead of its own.
        start = round(slice_size * fold)
        end = start + slice_size
        training = Corpus(sentences[:start] + sentences[end:])
        holdout = Corpus(sentences[start:end])
        for guess in range(slicings):
            # BUGFIX: the original indexed ``guesses[i]``, scoring one guess
            # repeatedly while attributing the perplexity to each slot.
            calculator = InterpolatingNgramProbabilityCalculator(
                NgramCalculatorContainer(training, n), guesses[guess])
            model = NgramLanguageModel(
                training, n, ngram_probability_calculator=calculator)
            perplexities[guess] += PerplexityCalculator(
            ).calculate_corpus_perplexity(model, holdout)
    return guesses[perplexities.index(min(perplexities))]
def q6_minimize_perplexity_of_reuters_corpus():
    """Print the Reuters test-set perplexity of the best interpolated
    trigram model found during lambda training."""
    training = ReutersTrainingCorpus()
    test = ReutersTestCorpus()
    # Lambda weights previously found by interpolation training.
    best_lambdas = [0.2518, 0.4691, 0.2791]
    interpolator = InterpolatingNgramProbabilityCalculator(
        NgramCalculatorContainer(training, 3), best_lambdas)
    model = NgramLanguageModel(
        training, 3, ngram_probability_calculator=interpolator)
    print("Test perplexity best model: ",
          PerplexityCalculator().calculate_corpus_perplexity(model, test))
def test_language_models_on_very_simple_corpus(self):
    """A single ten-token sentence of distinct tokens: the unigram model's
    perplexity equals the vocabulary size, while the bigram and trigram
    models predict every token perfectly (perplexity 1)."""
    corpus = Corpus([['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']])
    cases = [
        (UnigramLanguageModel(corpus), 10),
        (BigramLanguageModel(corpus), 1),
        (NgramLanguageModel(corpus, 3, NgramProbabilityCalculator), 1),
    ]
    for model, expected in cases:
        # Rounded to 10 decimals: these values should be essentially exact.
        assert expected == round(
            self.calculator.calculate_corpus_perplexity(model, corpus), 10)
def q7_perplexity_oov_of_brown_corpus():
    """Evaluate the Reuters-trained interpolated trigram model on the Brown
    corpus and print both its perplexity and Brown's out-of-vocabulary
    rate relative to the Reuters training vocabulary."""
    training = ReutersTrainingCorpus()
    brown = BrownCorpus()
    # Lambda weights previously found by interpolation training.
    interpolator = InterpolatingNgramProbabilityCalculator(
        NgramCalculatorContainer(training, 3), [0.2518, 0.4691, 0.2791])
    model = NgramLanguageModel(
        training, 3, ngram_probability_calculator=interpolator)
    perplexity = PerplexityCalculator().calculate_corpus_perplexity(
        model, brown)
    oov_rate = OutOfVocabularyRateCalculator(
    ).calculate_out_of_vocabulary_rate(training, brown)
    print("Brown perplexity best model: ", perplexity)
    print("Brown out of vocabulary rate: ", oov_rate)
def test_trainer_on_reuters_corpus(self):
    """10-fold cross-validation of interpolation lambdas on Reuters.

    Phase 1 estimates a lambda vector per fold; phase 2 scores every guess
    on every fold's holdout slice, then prints each guess's mean perplexity
    and finally the best (guess, perplexity) pair.
    """
    reuters = ReutersTrainingCorpus().get_sentences()
    slice_size = round(len(reuters) * .1)
    guesses = []
    for i in range(10):
        slice_index = round(slice_size * i)
        slice_end = slice_index + slice_size
        training = Corpus(reuters[:slice_index] + reuters[slice_end:])
        holdout = Corpus(reuters[slice_index:slice_end])
        guesses.append(InterpolationTrainer().find_lambdas_max_estimation(
            training, holdout, 3))
    perplexities = [0] * 10
    for slicing in range(10):
        # BUGFIX: the original computed ``round(slice_size * i)`` with the
        # stale ``i`` left over from the loop above (always 9), so every
        # fold re-scored the same holdout slice.
        slice_index = round(slice_size * slicing)
        slice_end = slice_index + slice_size
        training = Corpus(reuters[:slice_index] + reuters[slice_end:])
        holdout = Corpus(reuters[slice_index:slice_end])
        for guess in range(10):
            calculator = InterpolatingNgramProbabilityCalculator(
                NgramCalculatorContainer(training, 3), guesses[guess])
            max_estimation_language_model = NgramLanguageModel(
                training, 3, ngram_probability_calculator=calculator)
            perplexities[guess] += PerplexityCalculator(
            ).calculate_corpus_perplexity(
                max_estimation_language_model, holdout)
    min_perplexity = min(perplexities)
    best_lambdas = guesses[perplexities.index(min_perplexity)]
    for i in range(10):
        print(perplexities[i] / 10, guesses[i])
    print(min_perplexity, best_lambdas)
def find_lambdas_brute_force(self, training_corpus, holdout_corpus, n):
    """Grid-search interpolation lambdas in steps of 0.1.

    Tries every (l1, l2, l3) triple summing to 1 on the grid, builds an
    interpolated n-gram model for each, and keeps the triple with the
    lowest holdout perplexity.

    :return: ``(best_lambdas, minimum_perplexity)``
    """
    ngram_counter = NgramCalculatorContainer(training_corpus, n)
    perplexity_calculator = PerplexityCalculator()
    best_lambdas = []
    minimum_perplexity = sys.maxsize
    step = .1
    steps = round(1 / step)
    for i in range(steps + 1):
        l1 = round(step * i, 2)
        # NOTE(review): the bound ``steps - i`` keeps l3 >= 0.1, so triples
        # with a zero third lambda are never tried — presumably deliberate;
        # confirm before widening the grid.
        for j in range(steps - i):
            l2 = round(step * j, 2)
            l3 = round(1 - l1 - l2, 2)
            candidate = [l1, l2, l3]
            interpolator = InterpolatingNgramProbabilityCalculator(
                ngram_counter, candidate)
            language_model = NgramLanguageModel(
                training_corpus, n,
                ngram_probability_calculator=interpolator)
            perplexity = perplexity_calculator.calculate_corpus_perplexity(
                language_model, holdout_corpus)
            if perplexity < minimum_perplexity:
                minimum_perplexity = perplexity
                best_lambdas = candidate
    return best_lambdas, minimum_perplexity