Example #1
from nltk.corpus import gutenberg
from nltk import ConditionalFreqDist
from random import choice

#create the distribution object
cfd = ConditionalFreqDist()

## for each token count the current word given the previous word
prev_word = None
for word in gutenberg.words('austen-persuasion.txt'):
    cfd[prev_word][word] += 1
    prev_word = word

## start predicting at given word, say "therefore"
word = "therefore"
i = 1

## find all words that can follow the given word and choose one at random
while i < 20:
    print(word, end=' ')
    # all words observed after the current word
    lwords = list(cfd[word].keys())
    follower = choice(lwords)
    word = follower
    i += 1
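
Once the distribution is built it can also be queried directly. A minimal sketch of such queries, where the follower word 'the' is just an illustrative choice:

## inspect the distribution built above
fd = cfd['therefore']     # FreqDist of words observed after "therefore"
print(fd.max())           # most frequent follower
print(fd.freq('the'))     # relative frequency, i.e. estimated P('the' | 'therefore')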
    

Example #2
from nltk import word_tokenize, bigrams, ConditionalFreqDist, FreqDist

# read the sample corpus and strip the sentence markers
with open('sampledata.txt', 'r') as f:
    filetext = f.read()
filetext = filetext.replace('</s>', '')
filetext = filetext.replace('<s>', '')
tokens = word_tokenize(filetext)
tokens.append('<s>')
print(set(tokens))

# `vocab` and `fr` are not defined in this excerpt; assumed here to be the
# vocabulary and the unigram FreqDist built from the same tokens
vocab = sorted(set(tokens))
fr = FreqDist(tokens)

# copy the vocabulary (a plain assignment would only alias it) and add markers
vocab2 = list(vocab)
vocab2.append('</s>')
vocab2.append('UNK')

# bigram conditional frequency distribution: counts of w1 following w0
big = bigrams(tokens)
cfds = ConditionalFreqDist(big)
print(cfds.items())
# maximum-likelihood bigram probabilities: P(w1 | w0) = count(w0 w1) / count(w0)
for v3 in vocab2:
    Unk2 = 0
    fr2 = cfds.get(v3)            # followers of v3, or None if v3 never starts a bigram
    if fr2 is not None:
        unigramCount = fr[v3]     # count of the conditioning word v3
        for w1, c in fr2.items():
            print('P(' + str(w1) + '|' + v3 + ') = ' +
                  str(round(c / unigramCount, 2)))
    else:
        Unk2 += 1
    # Unk2 is 1 when v3 was never seen as a bigram condition, 0 otherwise
    print('P(' + v3 + '|UNK) = ' + str(Unk2))
print('======= BIGRAMS SMOOTHING =======')
for v3 in vocab2:
    Unk2 = 0
    fr2 = cfds.get(v3)
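
A minimal sketch of the add-one (Laplace) smoothing step using the same cfds, fr, and vocab2 built above; treating len(vocab2) as the vocabulary size is an assumption, not part of the original listing:

## add-one (Laplace) smoothed bigram probabilities:
## P(w1 | w0) = (count(w0 w1) + 1) / (count(w0) + V)
V = len(vocab2)
for w0 in vocab2:
    count_w0 = fr[w0]                                   # unigram count, 0 for unseen words
    for w1 in vocab2:
        count_big = cfds[w0][w1] if w0 in cfds else 0   # bigram count, 0 when unseen
        p = (count_big + 1) / (count_w0 + V)
        print('P(' + w1 + '|' + w0 + ') = ' + str(round(p, 4)))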