def part_one(train, dev, test):
    # Unigram model was my first attempt -- the ngram class ultimately handles
    # the unigram case too, but keeping this to 'show my work'.
    print("{:>10} {:>20} {:>20} {:>20}".format("", "brown.dev", "brown.train", "brown.test"))
    unigramModel = UnigramModel(train)
    perplexity_dev = calculate_perplexity(dev, unigramModel.probabilities)
    perplexity_train = calculate_perplexity(train, unigramModel.probabilities)
    perplexity_test = calculate_perplexity(test, unigramModel.probabilities)
    print("{:>10} {:=20} {:=20} {:=20}".format("unigram", perplexity_dev, perplexity_train, perplexity_test))

    # bigrams
    bigram = ngram(train, 2)
    perplexity_dev = calculate_ngram_perplexity(dev, bigram.vocabulary_space, bigram.probabilities, 2)
    perplexity_train = calculate_ngram_perplexity(train, bigram.vocabulary_space, bigram.probabilities, 2)
    perplexity_test = calculate_ngram_perplexity(test, bigram.vocabulary_space, bigram.probabilities, 2)
    print("{:>10} {:=20} {:=20} {:=20}".format("bigram", perplexity_dev, perplexity_train, perplexity_test))

    # trigrams (note: these calls previously reused the bigram model's
    # vocabulary and probabilities; they should use the trigram model's)
    trigram = ngram(train, 3)
    perplexity_dev = calculate_ngram_perplexity(dev, trigram.vocabulary_space, trigram.probabilities, 3)
    perplexity_train = calculate_ngram_perplexity(train, trigram.vocabulary_space, trigram.probabilities, 3)
    perplexity_test = calculate_ngram_perplexity(test, trigram.vocabulary_space, trigram.probabilities, 3)
    print("{:>10} {:=20} {:=20} {:=20}".format("trigram", perplexity_dev, perplexity_train, perplexity_test))
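# The perplexity helpers above (calculate_perplexity / calculate_ngram_perplexity)
# are defined elsewhere in the repo. The sketch below is only a minimal illustration
# of the standard formula PP = exp(-(1/N) * sum(log p(w))); the function name
# sketch_unigram_perplexity and the <unk> fallback are hypothetical, not the
# project's actual API.
import math

def sketch_unigram_perplexity(tokens, probabilities, unk_token="<unk>"):
    # Sum log-probabilities, falling back to the <unk> probability for unseen words.
    log_prob_total = 0.0
    for token in tokens:
        p = probabilities.get(token, probabilities.get(unk_token, 1e-12))
        log_prob_total += math.log(p)
    # Perplexity is the exponentiated average negative log-likelihood.
    return math.exp(-log_prob_total / len(tokens))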
def part_two(train, dev, test):
    # trigrams w/ add-k smoothing
    n = 3  # for playing with higher n-grams only
    print("{:>10} {:>20} {:>20} {:>20}".format("Add K =", "brown.dev", "brown.train", "brown.test"))
    k = 10
    for i in range(6):
        trigram_smooth = ngram(train, n, k)
        perplexity_dev = calculate_ngram_perplexity(
            dev, trigram_smooth.vocabulary_space, trigram_smooth.probabilities, n, smoothed=True)
        perplexity_train = calculate_ngram_perplexity(
            train, trigram_smooth.vocabulary_space, trigram_smooth.probabilities, n, smoothed=True)
        perplexity_test = calculate_ngram_perplexity(
            test, trigram_smooth.vocabulary_space, trigram_smooth.probabilities, n, smoothed=True)
        print("{:>10} {:=20} {:=20} {:=20}".format(k, perplexity_dev, perplexity_train, perplexity_test))
        k = k / 10  # order of magnitude step down
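# The add-k smoothing itself lives inside the ngram class; for reference, the
# smoothed probability is conventionally P(w | h) = (count(h, w) + k) / (count(h) + k * V),
# where V is the vocabulary size. The stand-alone sketch below is hypothetical,
# not the project's implementation.
def sketch_add_k_probability(ngram_counts, history_counts, history, word, k, vocab_size):
    # Numerator: observed count of the full n-gram (history tuple + word), plus k pseudo-counts.
    numerator = ngram_counts.get(history + (word,), 0) + k
    # Denominator: count of the history, plus k for every word in the vocabulary.
    denominator = history_counts.get(history, 0) + k * vocab_size
    return numerator / denominator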
def test_Experiment_unigram(self):
    ngrammer = ngram(TESTDATA_TXT, 1)
    self.assertEqual(10, ngrammer.count_ngrams())
    self.assertEqual(2, ngrammer.distinct_ngrams())  # unks
def test_counts_ngrams_correctly_for_trigram(self):
    ngrammer = ngram(TESTDATA_TXT, 3)
    self.assertEqual(10, ngrammer.count_ngrams())
def __init__(self, path_to_training_data):
    # Build one model per order from the same training data.
    self.unigram = ngram(path_to_training_data, 1)
    self.bigram = ngram(path_to_training_data, 2)
    self.trigram = ngram(path_to_training_data, 3)
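# Assuming this class combines the three orders by linear interpolation, the usual
# form is P(w | u, v) = l1 * P_uni(w) + l2 * P_bi(w | v) + l3 * P_tri(w | u, v)
# with l1 + l2 + l3 = 1. The method below is a hypothetical sketch of that
# combination, not the class's actual interface; the lambda values are illustrative.
def sketch_interpolated_probability(p_uni, p_bi, p_tri, lambdas=(0.1, 0.3, 0.6)):
    # Weighted sum of the three component probabilities; weights must sum to 1.
    l1, l2, l3 = lambdas
    return l1 * p_uni + l2 * p_bi + l3 * p_tri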