# load bigram
    bigrams = load_bigram(open(argv[1]))
    # load unigram
    uni = load_unigram(open(argv[2]))
    words = uni['words']
    total_tokens = uni['total_tokens']
    training_gram = Ngram(total_tokens, words, bigrams)
    # load x, y, smooth_method
    x = argv[3]
    y = argv[4]
    # exit if x is non-existent in training
    if argv[3] not in words:
        print 'We are incredibly sorry, but the word you requested was not found in the training set'
        exit()
    # return probability if bigram has been seen in training
    if (x, y) in bigrams:
        print 'Pr({}|{}) = {}'.format(
            y, x, training_gram.get_prob(x, y, smooth_index))
    else:
        # bigram (x,y) has not been seen, calculate probability for specific smoothing for bigram (x,y)
        if smooth_method == 'M':
            print "Pr({}|{}) = {}".format(y, x, training_gram.mle(x, y))
        elif smooth_method == 'L':
            print "Pr({}|{}) = {}".format(y, x,
                                          training_gram.laplace_bigram(x, y))
        elif smooth_method == 'I':
            print "Pr({}|{}) = {}".format(
                y, x, training_gram.interpolation(x, y, 0.3))
        else:
            print "Pr({}|{}) = {}".format(y, x, training_gram.pr_k(x, y))
Beispiel #2
0
 # Tokenise the test corpus named by argv[3] and print the perplexity of the
 # Laplace bigram, interpolated bigram (weight 0.3) and Laplace unigram
 # estimates provided by training_gram.
 test_words = []
 # The loop stays inside the `with` so the file is still open if
 # sentence_segmentation reads it lazily; the handle is closed afterwards.
 with open(argv[3]) as test_file:
     sentences = sentence_segmentation(test_file)
     for sen in sentences:
         # wrap every sentence in the start/end symbols before tokenising
         tokens = tokenization(start_sym + ' ' + sen + ' ' + end_sym)
         for tok in tokens:
             # skip empty tokens, lowercase the rest
             if tok != '':
                 test_words.append(tok.lower())
 # NOTE(review): if no sentences are found, test_size is 0 and the division
 # below raises ZeroDivisionError.
 test_size = len(test_words)
 # accumulate base-2 log-probabilities for each model
 bi_perplexity = 0
 inter_perplexity = 0
 # the unigram sum also includes the initial start symbol
 uni_perplexity = log(training_gram.laplace_unigram(start_sym), 2)
 # x is the previous word; the loop skips test_words[0], presumably the
 # start symbol that x is initialised to -- confirm
 x = start_sym
 for y in test_words[1:]:
     bi_perplexity += log(training_gram.laplace_bigram(x, y), 2)
     inter_perplexity += log(training_gram.interpolation(x, y, 0.3), 2)
     uni_perplexity += log(training_gram.laplace_unigram(y), 2)
     x = y
 # perplexity = 2 ** (-(1 / N) * sum of log2 probabilities), N = test_size
 exponent_scale = -1 / float(test_size)
 bi_perplexity = pow(2, exponent_scale * bi_perplexity)
 inter_perplexity = pow(2, exponent_scale * inter_perplexity)
 uni_perplexity = pow(2, exponent_scale * uni_perplexity)
 # output perplexities
 out = "Laplace Bigram: {}\nInterpolated Bigram: {}\nLaplace Unigram: {}".format(
     str(bi_perplexity), str(inter_perplexity), str(uni_perplexity))
 # single-argument print(...) behaves the same on Python 2 and Python 3
 print(out)