Ejemplo n.º 1
0
def part_one(train, dev, test):
    """Print a perplexity table for unsmoothed unigram, bigram and trigram models.

    Every model is built from ``train`` and then scored against ``dev``,
    ``train`` and ``test``, in that column order. One row is printed per
    model; nothing is returned.

    Args:
        train: Training data handed to ``UnigramModel`` / ``ngram`` and also
            scored (presumably a path to the brown.train file -- confirm
            against the caller).
        dev: Held-out development data to score.
        test: Held-out test data to score.
    """
    header_fmt = "{:>10} {:>20} {:>20} {:>20}"
    # '=' alignment pads between the sign and the digits, so the scores must
    # be numeric for this format to work.
    row_fmt = "{:>10} {:=20} {:=20} {:=20}"
    corpora = (dev, train, test)

    print(header_fmt.format("", "brown.dev", "brown.train", "brown.test"))

    # The dedicated UnigramModel was the first attempt. The generic ngram
    # class could ultimately handle the unigram case as well, but this path
    # is kept to show the work.
    unigram_model = UnigramModel(train)
    unigram_scores = [
        calculate_perplexity(corpus, unigram_model.probabilities)
        for corpus in corpora
    ]
    print(row_fmt.format("unigram", *unigram_scores))

    # Bigrams and trigrams share one code path. Each row is scored with the
    # model of the matching order (the trigram row previously reused the
    # bigram model's vocabulary and probabilities by mistake).
    for label, order in (("bigram", 2), ("trigram", 3)):
        model = ngram(train, order)
        scores = [
            calculate_ngram_perplexity(corpus, model.vocabulary_space,
                                       model.probabilities, order)
            for corpus in corpora
        ]
        print(row_fmt.format(label, *scores))
Ejemplo n.º 2
0
def part_two(train, dev, test):
    """Print a perplexity table for smoothed trigram models over six values of k.

    Starting at ``k = 10`` and dividing by ten after each row, a model is
    built from ``train`` with ``ngram(train, n, k)`` and scored against
    ``dev``, ``train`` and ``test`` (in that column order) with
    ``smoothed=True``. Nothing is returned.

    Args:
        train: Training data used to build each model; also scored.
        dev: Development data to score.
        test: Test data to score.
    """
    order = 3  # raise this to experiment with higher-order n-grams
    header_fmt = "{:>10} {:>20} {:>20} {:>20}"
    row_fmt = "{:>10} {:=20} {:=20} {:=20}"

    print(header_fmt.format("Add K =", "brown.dev", "brown.train",
                            "brown.test"))

    k = 10
    for _ in range(6):
        smoothed_model = ngram(train, order, k)
        # Score in the same column order as the header: dev, train, test.
        scores = [
            calculate_ngram_perplexity(corpus,
                                       smoothed_model.vocabulary_space,
                                       smoothed_model.probabilities,
                                       order,
                                       smoothed=True)
            for corpus in (dev, train, test)
        ]
        print(row_fmt.format(k, *scores))
        k = k / 10  # step k down by one order of magnitude
Ejemplo n.º 3
0
 def test_Experiment_unigram(self):
     """Check total and distinct n-gram counts for an order-1 model of TESTDATA_TXT."""
     unigram_model = ngram(TESTDATA_TXT, 1)
     total = unigram_model.count_ngrams()
     self.assertEqual(10, total)
     distinct = unigram_model.distinct_ngrams()
     self.assertEqual(2, distinct)  # unks
Ejemplo n.º 4
0
 def test_counts_ngrams_correctly_for_trigram(self):
     """Check that an order-3 model of TESTDATA_TXT reports 10 n-grams in total."""
     trigram_model = ngram(TESTDATA_TXT, 3)
     total = trigram_model.count_ngrams()
     self.assertEqual(10, total)
Ejemplo n.º 5
0
    def __init__(self, path_to_training_data):
        """Build order-1, order-2 and order-3 ngram models from the same training data.

        The models are stored on ``self.unigram``, ``self.bigram`` and
        ``self.trigram`` respectively.

        Args:
            path_to_training_data: Passed unchanged to each ``ngram(...)`` call.
        """
        # Attribute names are listed by ascending order so that enumerate
        # yields the matching n (1, 2, 3) and models are built in that order.
        for order, attribute in enumerate(("unigram", "bigram", "trigram"),
                                          start=1):
            setattr(self, attribute, ngram(path_to_training_data, order))