def __init__(self, corpus):
    """
    Set up the unigram statistics used by this language model.

    :param corpus: corpus over which to create the language model
    """
    # calculate_unigrams() yields a pair: the unigram data and the total
    # unigram count for the corpus.
    unigrams, total_unigrams = UnigramCalculator(corpus).calculate_unigrams()
    self.unigrams = unigrams
    self.corpus_unigram_length = total_unigrams

    # Number of sentences the corpus reports.
    sentences = corpus.get_sentences()
    self.corpus_sentence_length = len(sentences)
class TestUnigramCalculator:
    """Checks for UnigramCalculator run against the default Corpus fixture."""

    # Shared fixtures: built once when the class body is executed.
    corpus = Corpus()
    unigram_calculator = UnigramCalculator(corpus)

    def test_calculate_unigrams(self):
        """The calculator's total and unigram data match the fixture corpus."""
        calculator = self.unigram_calculator

        expected_total = 20
        assert expected_total == calculator.total_unigrams

        expected_unigrams = self.corpus.unigrams
        assert expected_unigrams == calculator.unigrams

    def test_get_percentage_unique_unigrams(self):
        """The unique-unigram percentage equals 16 / 18 for the fixture corpus."""
        expected_ratio = 16 / 18
        actual_ratio = self.unigram_calculator.get_percentage_unique_unigrams()
        assert expected_ratio == actual_ratio
def __init__(self, corpus, n):
    """
    Prepare the unigram calculator and the higher-order n-gram calculators.

    :param corpus: corpus handed to every calculator; must provide
        ``get_sentences()``
    :param n: upper bound (inclusive) of the values 2..n for which an
        ``NgramCalculator`` is created; nothing is created when ``n < 2``
    """
    self.n = n

    sentences = corpus.get_sentences()
    self.corpus_sentence_length = len(sentences)

    unigram_calculator = UnigramCalculator(corpus)
    self.unigram_calculator = unigram_calculator
    self.corpus_unigram_length = unigram_calculator.total_unigrams

    # One calculator per value in 2..n. The trailing True/False flags are
    # passed through unchanged; their meaning is defined by NgramCalculator.
    self.ngram_calculators = [
        NgramCalculator(corpus, order, True, False)
        for order in range(2, n + 1)
    ]
def q2_calculate_unique_unigrams_in_reuters_training():
    """
    Print the percentage of unique unigrams in the Reuters training corpus.

    The figure is obtained twice, once from ``UnigramCalculator`` and once
    from ``BasicNgramCalculator`` queried with 1, and both results are
    printed.
    """
    reuters = ReutersTrainingCorpus()
    by_unigram = UnigramCalculator(reuters)
    by_ngram = BasicNgramCalculator(reuters)

    # Compute both values before printing anything, so no output appears
    # if either calculation fails.
    pct_from_unigrams = by_unigram.get_percentage_unique_unigrams()
    pct_from_ngrams = by_ngram.get_percentage_unique_ngrams(1)

    report = (
        ("Percentage of unique unigrams (unigram calculator): ",
         pct_from_unigrams),
        ("Percentage of unique unigrams (ngram calculator): ",
         pct_from_ngrams),
    )
    for label, value in report:
        print(label, value)