def __init__(self, corpus):
    """
    Initialise the language model from a corpus.

    Stores the pair returned by ``UnigramCalculator.calculate_unigrams()``
    as ``unigrams`` and ``corpus_unigram_length``, and the number of items
    returned by ``corpus.get_sentences()`` as ``corpus_sentence_length``.

    :param corpus: corpus over which to create the language model
    """
    unigrams, unigram_total = UnigramCalculator(corpus).calculate_unigrams()
    self.unigrams = unigrams
    self.corpus_unigram_length = unigram_total

    sentences = corpus.get_sentences()
    self.corpus_sentence_length = len(sentences)
Exemple #2
0
class TestUnigramCalculator:
    """Checks ``UnigramCalculator`` against a default-constructed ``Corpus``."""

    # Both fixtures are created once, when the class body executes, and are
    # shared by every test method below.
    corpus = Corpus()
    unigram_calculator = UnigramCalculator(corpus)

    def test_calculate_unigrams(self):
        """The total is 20 and the unigrams equal those held by the corpus."""
        calculator = self.unigram_calculator
        assert 20 == calculator.total_unigrams
        assert self.corpus.unigrams == calculator.unigrams

    def test_get_percentage_unique_unigrams(self):
        """The value from get_percentage_unique_unigrams() equals 16 / 18."""
        expected = 16 / 18
        actual = self.unigram_calculator.get_percentage_unique_unigrams()
        assert expected == actual
Exemple #3
0
    def __init__(self, corpus, n):
        """
        Record corpus sizes and create the calculators used by this object.

        :param corpus: corpus handed to every calculator created here; its
            ``get_sentences()`` result is used for the sentence count
        :param n: highest n-gram order; one ``NgramCalculator`` is created
            for each order from 2 up to and including ``n``
        """
        self.n = n
        sentences = corpus.get_sentences()
        self.corpus_sentence_length = len(sentences)

        unigram_calc = UnigramCalculator(corpus)
        self.unigram_calculator = unigram_calc
        self.corpus_unigram_length = unigram_calc.total_unigrams

        # Orders 2..n inclusive; the list is empty when n < 2.
        self.ngram_calculators = [
            NgramCalculator(corpus, order, True, False)
            for order in range(2, n + 1)
        ]
Exemple #4
0
def q2_calculate_unique_unigrams_in_reuters_training():
    """
    Print the unique-unigram percentage of the Reuters training corpus twice.

    The first value comes from ``UnigramCalculator`` and the second from
    ``BasicNgramCalculator`` queried with n = 1, so the two results can be
    compared side by side on standard output.
    """
    reuters = ReutersTrainingCorpus()
    by_unigram_calc = UnigramCalculator(reuters)
    by_ngram_calc = BasicNgramCalculator(reuters)

    # Both values are computed before anything is printed.
    report = (
        ("Percentage of unique unigrams (unigram calculator): ",
         by_unigram_calc.get_percentage_unique_unigrams()),
        ("Percentage of unique unigrams (ngram calculator): ",
         by_ngram_calc.get_percentage_unique_ngrams(1)),
    )
    for label, value in report:
        print(label, value)