def q4_calculate_oov_rate_for_reuters_test_with_respect_to_training():
    """Print the out-of-vocabulary rate of the Reuters test corpus.

    The rate is computed by ``OutOfVocabularyRateCalculator`` with the
    Reuters training corpus passed first and the Reuters test corpus passed
    second. The result is written to stdout; nothing is returned.
    """
    # NOTE: this function used to be defined twice with identical bodies;
    # the later definition shadowed the earlier one, so a single copy is kept.
    training = ReutersTrainingCorpus()
    test = ReutersTestCorpus()

    oov_calculator = OutOfVocabularyRateCalculator()
    oov_rate = oov_calculator.calculate_out_of_vocabulary_rate(training, test)

    print("Out of vocabulary rate: ", oov_rate)
def q7_perplexity_oov_of_brown_corpus():
    """Print perplexity and out-of-vocabulary rate for the Brown corpus.

    A trigram (order 3) ``NgramLanguageModel`` is built on the Reuters
    training corpus, using an ``InterpolatingNgramProbabilityCalculator``
    configured with three fixed lambda weights. The model's perplexity on
    the Brown corpus and the Brown corpus' out-of-vocabulary rate with
    respect to the Reuters training corpus are written to stdout; nothing
    is returned.
    """
    # NOTE: this function used to be defined twice (the copies differed only
    # in line wrapping); the later definition shadowed the earlier one, so a
    # single copy is kept.
    training = ReutersTrainingCorpus()
    brown = BrownCorpus()

    # Fixed interpolation weights, one per n-gram order; they sum to 1.
    # NOTE(review): presumably tuned elsewhere ("best model") - confirm
    # which order each weight applies to.
    lambdas = [0.2518, 0.4691, 0.2791]
    calculator = InterpolatingNgramProbabilityCalculator(
        NgramCalculatorContainer(training, 3), lambdas)
    language_model = NgramLanguageModel(
        training, 3, ngram_probability_calculator=calculator)

    perplexity = PerplexityCalculator().calculate_corpus_perplexity(
        language_model, brown)

    oov_calculator = OutOfVocabularyRateCalculator()
    oov_rate = oov_calculator.calculate_out_of_vocabulary_rate(training, brown)

    print("Brown perplexity best model: ", perplexity)
    print("Brown out of vocabulary rate: ", oov_rate)
class TestOutOfVocabularyRateCalculator:
    """Test ``OutOfVocabularyRateCalculator`` on a small hand-built corpus pair."""

    # Tokenised sentences wrapped in ``Corpus`` and passed as the training
    # corpus (first argument) to the calculator.
    training_corpus = [
        ['how', 'many', 'roads', 'must', 'a', 'man', 'walk', 'down'],
        ['before', 'you', 'call', 'him', 'a', 'man'],
        ['how', 'many', 'seas', 'must', 'a', 'white', 'dove', 'sail'],
        ['before', 'she', 'sleeps', 'in', 'the', 'sand'],
    ]

    # Tokenised sentences wrapped in ``Corpus`` and passed as the test corpus
    # (second argument). They contain 30 tokens in total, 19 of which do not
    # occur anywhere in ``training_corpus``.
    test_corpus = [
        ['yes', 'and', 'how', 'many', 'times', 'must', 'the', 'cannon',
         'balls', 'fly'],
        ['before', "they're", 'forever', 'banned'],
        ['the', 'answer', 'my', 'friend', 'is', "blowin'", 'in', 'the',
         'wind'],
        ['the', 'answer', 'is', "blowin'", 'in', 'the', 'wind'],
    ]

    # Calculator under test, shared by the test methods of this class.
    calculator = OutOfVocabularyRateCalculator()

    def test_calculate_out_of_vocabulary_rate(self):
        """The rate for the fixture corpora must equal 19/30."""
        # Local import keeps this block self-contained.
        import math

        oov_rate = self.calculator.calculate_out_of_vocabulary_rate(
            Corpus(self.training_corpus), Corpus(self.test_corpus))

        # Compare with a tolerance rather than ``==``: the rate is a float
        # ratio, and an equivalent computation (e.g. 1 - 11/30) may differ
        # from 19/30 in the last bits.
        assert math.isclose(oov_rate, 19 / 30)