def q4_calculate_oov_rate_for_reuters_test_with_respect_to_training():
    """Print the out-of-vocabulary rate of the Reuters test corpus.

    The rate is obtained from ``OutOfVocabularyRateCalculator``, called with
    the Reuters training corpus as the first argument and the Reuters test
    corpus as the second. Nothing is returned; the result is written to
    standard output.
    """
    reuters_train = ReutersTrainingCorpus()
    reuters_test = ReutersTestCorpus()
    rate_calculator = OutOfVocabularyRateCalculator()

    print(
        "Out of vocabulary rate: ",
        rate_calculator.calculate_out_of_vocabulary_rate(reuters_train, reuters_test),
    )
Example #2
0
def q4_calculate_oov_rate_for_reuters_test_with_respect_to_training():
    """Report how much of the Reuters test corpus is out of vocabulary.

    Builds the Reuters training and test corpora, asks an
    ``OutOfVocabularyRateCalculator`` for the rate of the test corpus with
    respect to the training corpus, and prints it. Returns ``None``.
    """
    train_corpus = ReutersTrainingCorpus()
    eval_corpus = ReutersTestCorpus()
    compute_rate = OutOfVocabularyRateCalculator().calculate_out_of_vocabulary_rate

    # Training corpus goes first, the corpus being measured second.
    result = compute_rate(train_corpus, eval_corpus)
    print("Out of vocabulary rate: ", result)
def q7_perplexity_oov_of_brown_corpus():
    """Print the perplexity and out-of-vocabulary rate of the Brown corpus.

    A language model is built from the Reuters training corpus (constructed
    with ``3`` as its order argument) using an interpolating n-gram
    probability calculator with fixed weights. The model's perplexity on the
    Brown corpus and the Brown corpus's out-of-vocabulary rate with respect
    to the Reuters training corpus are then written to standard output.
    Returns ``None``.
    """
    reuters_train = ReutersTrainingCorpus()
    brown_corpus = BrownCorpus()

    # Weights handed to InterpolatingNgramProbabilityCalculator.
    # NOTE(review): presumably tuned elsewhere ("best model") - confirm source.
    interpolation_weights = [0.2518, 0.4691, 0.2791]
    ngram_container = NgramCalculatorContainer(reuters_train, 3)
    interpolator = InterpolatingNgramProbabilityCalculator(
        ngram_container, interpolation_weights
    )
    model = NgramLanguageModel(
        reuters_train, 3, ngram_probability_calculator=interpolator
    )
    brown_perplexity = PerplexityCalculator().calculate_corpus_perplexity(
        model, brown_corpus
    )

    brown_oov_rate = OutOfVocabularyRateCalculator().calculate_out_of_vocabulary_rate(
        reuters_train, brown_corpus
    )

    print("Brown perplexity best model: ", brown_perplexity)
    print("Brown out of vocabulary rate: ", brown_oov_rate)
Example #4
0
def q7_perplexity_oov_of_brown_corpus():
    """Evaluate the Reuters-trained interpolated model on the Brown corpus.

    Prints two figures: the perplexity of the Brown corpus under an
    ``NgramLanguageModel`` built from the Reuters training corpus, and the
    out-of-vocabulary rate of the Brown corpus relative to that training
    corpus. Nothing is returned.
    """
    train_corpus = ReutersTrainingCorpus()
    brown_data = BrownCorpus()

    # Fixed interpolation weights passed to the probability calculator.
    weights = [0.2518, 0.4691, 0.2791]
    container = NgramCalculatorContainer(train_corpus, 3)
    probability_calculator = InterpolatingNgramProbabilityCalculator(container, weights)
    model = NgramLanguageModel(
        train_corpus,
        3,
        ngram_probability_calculator=probability_calculator,
    )
    perplexity_calculator = PerplexityCalculator()
    perplexity_value = perplexity_calculator.calculate_corpus_perplexity(model, brown_data)

    compute_oov_rate = OutOfVocabularyRateCalculator().calculate_out_of_vocabulary_rate
    oov_value = compute_oov_rate(train_corpus, brown_data)

    print("Brown perplexity best model: ", perplexity_value)
    print("Brown out of vocabulary rate: ", oov_value)
Example #5
0
class TestOutOfVocabularyRateCalculator:
    """Check ``OutOfVocabularyRateCalculator`` against small hand-written corpora."""

    # Tokenised sentences that define the known vocabulary.
    training_corpus = [
        ['how', 'many', 'roads', 'must', 'a', 'man', 'walk', 'down'],
        ['before', 'you', 'call', 'him', 'a', 'man'],
        ['how', 'many', 'seas', 'must', 'a', 'white', 'dove', 'sail'],
        ['before', 'she', 'sleeps', 'in', 'the', 'sand']
    ]
    # 30 tokens in total; 19 of them never occur in training_corpus.
    test_corpus = [
        ['yes', 'and', 'how', 'many', 'times', 'must', 'the', 'cannon', 'balls', 'fly'],
        ['before', "they're", 'forever', 'banned'],
        ['the', 'answer', 'my', 'friend', 'is', "blowin'", 'in', 'the', 'wind'],
        ['the', 'answer', 'is', "blowin'", 'in', 'the', 'wind']
    ]

    calculator = OutOfVocabularyRateCalculator()

    def test_calculate_out_of_vocabulary_rate(self):
        """The rate for the fixture corpora is 19 unseen tokens out of 30."""
        # Imported locally so the test does not depend on a module-level import.
        import math

        expected_rate = 19 / 30
        actual_rate = self.calculator.calculate_out_of_vocabulary_rate(
            Corpus(self.training_corpus), Corpus(self.test_corpus))

        # Compare with a tolerance: exact float equality would fail if the
        # calculator reaches the same ratio through different arithmetic
        # (e.g. 1 - 11/30), which can differ in the last bit.
        assert math.isclose(actual_rate, expected_rate), (
            "expected OOV rate %r, got %r" % (expected_rate, actual_rate))