Ejemplo n.º 1
0
    def conditional_freq(self):
        """Flatten the bigram conditional frequency distribution into triples.

        Builds a ``ConditionalFreqDist`` from ``self.bigram_list`` and returns
        a list with one ``(condition, word, count)`` tuple for every
        condition/word pair recorded in that distribution.
        """
        distribution = ConditionalFreqDist(self.bigram_list)
        return [
            (condition, sample, count)
            for condition, samples in distribution.items()
            for sample, count in samples.items()
        ]
Ejemplo n.º 2
0
class BigramWordCandidateProvider(object):
    """Provides candidate next words given a word using a bigram model."""

    def __init__(self, corpus):
        """Initializer of the BigramWordCandidateProvider.

        Args:
            corpus: An iterable of word strings.
        """
        self._cfd = ConditionalFreqDist(bigrams(corpus))

    def candidates(self, word_sequence):
        """Returns a list of candidate next words given a word sequence.

        Only the last word of the sequence is consulted.

        Args:
            word_sequence: A non-empty sequence of word strings.

        Returns:
            The words recorded as following the last word, most frequent
            first. An empty list when the last word has no recorded
            followers.

        Raises:
            IndexError: If word_sequence is empty.
        """
        word = word_sequence[-1]
        # The distribution is a defaultdict: indexing it with an unseen word
        # would insert an empty entry that random_word() could later return.
        # Test membership first so lookups never modify the model.
        if word not in self._cfd:
            return []
        return [candidate for candidate, _ in self._cfd[word].most_common()]

    def random_word(self):
        """Returns a random word chosen among the model's conditions.

        Raises:
            IndexError: If the model holds no conditions.
        """
        # Iterating the mapping yields its keys, so there is no need to build
        # (key, value) pairs and throw the value away.
        return random.choice(list(self._cfd))
Ejemplo n.º 3
0
#%%
from nltk.corpus import inaugural
from nltk import ConditionalFreqDist
from nltk.probability import FreqDist

# Relative frequency of the token 'freedom' over every word of the inaugural
# corpus. FreqDist consumes the word iterable directly; no list copy needed.
fd3 = FreqDist(inaugural.words())
print(fd3.freq('freedom'))

# Word-length distribution per file, restricted to fileids that compare
# between '1980' and '2010' as strings (presumably fileids start with the
# year -- confirm against the corpus).
# The filter sits ahead of the inner loop so the words of excluded files are
# never read and the condition is tested once per file, not once per word.
cfd = ConditionalFreqDist(
    (fileid, len(w))
    for fileid in inaugural.fileids()
    if '1980' < fileid < '2010'
    for w in inaugural.words(fileid)
)

print(cfd.items())
cfd.plot()
# %%
Ejemplo n.º 4
0
        UNK += lpt.prob(r[0])
print('UNK   |       ', UNK)
print('=========== BIGRAMS ===========')
# Read the sample text. The context manager closes the handle as soon as the
# content has been read.
with open('sampledata.txt', 'r') as file:
    filetext = file.read()
# Drop the sentence-boundary markers before tokenizing.
filetext = filetext.replace('</s>', '').replace('<s>', '')
tokens = word_tokenize(filetext)
tokens.append('<s>')
print(set(tokens))
# NOTE(review): this is an alias, not a copy -- the two appends below also
# grow `vocab`. Left unchanged because code outside this section may rely on
# it; confirm whether a copy was intended.
vocab2 = vocab
vocab2.append('</s>')
# NOTE(review): 'UTK' looks like a typo for 'UNK' -- confirm before changing.
vocab2.append('UTK')
big = bigrams(tokens)
# Conditional counts: for each first word, how often each second word follows.
cfds = ConditionalFreqDist(big)
print(cfds.items())
for v3 in vocab2:
    Unk2 = 0
    # .get() returns None for a word that never starts a bigram instead of
    # inserting an empty entry into the distribution.
    fr2 = cfds.get(v3)
    if fr2 is not None:
        # `fr` is defined earlier in the script and is read here as a mapping
        # from word to count; look the count up once per word (0 if absent).
        unigramCount = fr.get(v3, 0)
        # NOTE(review): the printed value is count(v3, follower) / count(v3),
        # yet the label puts v3 on the left of the bar -- confirm the intended
        # order. A zero unigramCount raises ZeroDivisionError here.
        for follower, pair_count in fr2.items():
            print('P(' + v3 + '|' + str(follower) + ') = ' +
                  str(round(pair_count / unigramCount, 2)))
    else:
        Unk2 += 1
    print('P(' + v3 + '|UNK) = ' + str(Unk2))
print('======= BIGRAMS SMOOTHING =======')