Ejemplo n.º 1
0
 def add_doc(self, tokens, author):
     """Add the trigram counts of one document to ``author``'s counts.

     Args:
         tokens: token sequence of the document; it is passed to
             ``trigrams`` and the resulting trigrams are counted.
         author: key under which the counts are accumulated in
             ``self.freqdists``.

     Side effects:
         Creates the per-author entries in ``self.freqdists`` and
         ``self.prob_dists`` on first use, updates the author's frequency
         distribution, and marks the author's probability distribution as
         needing a recount.
     """
     if author not in self.freqdists:
         self.freqdists[author] = FreqDist()
         self.prob_dists[author] = KneserNeyProbDist(self.freqdists[author])
     self.freqdists[author].update(FreqDist(trigrams(tokens)))
     # Flag the author on EVERY addition, not only on creation: the stored
     # probability distribution was built from the old counts, so any code
     # that clears this flag after a recount would otherwise keep using a
     # stale distribution once more documents are added.
     self.needs_probs_recounted[author] = True
 def model_KN(self, contents):
     ''' Build a Kneser-Ney smoothed n-gram model from the passed list.

         - contents : list containing repaired contents of the file whose
           n-gram model is to be created

         The n-grams of order self.N are counted with FreqDist and handed to
         NLTK's KneserNeyProbDist. The distribution's discount value is
         stored in self.discount_KN as a side effect.

         Returns a dict mapping every sample of the distribution to its
         probability.
     '''
     ngram_counts = FreqDist(list(ngrams(contents, self.N)))
     smoothed = KneserNeyProbDist(ngram_counts)
     self.discount_KN = smoothed.discount()
     return {sample: smoothed.prob(sample) for sample in smoothed.samples()}
    def predict_author(self, tokens):
        """Return the author whose trigram distribution scores ``tokens`` best.

        The tokens are first mapped through ``get_token_shapes``. For every
        author the score is the sum of ``log(p + self.smoothing)`` over the
        probabilities ``p`` that the author's distribution assigns to the
        trigrams of the shaped tokens.

        Args:
            tokens: token sequence of the document to attribute.

        Returns:
            Tuple ``(likely_author, best_score)``. Both are ``None`` when
            ``self.prob_dists`` is empty.

        Side effects:
            Rebuilds the probability distribution of every author flagged in
            ``self.needs_probs_recounted`` and clears that flag.
        """
        # The trigram list does not depend on the author, so build it once.
        shape_trigrams = list(trigrams(get_token_shapes(tokens)))

        prob_dists = self.prob_dists
        freqdists = self.freqdists
        smoothing = self.smoothing

        for author in freqdists:
            # Only recount those that have since been modified (by having a
            # doc added).
            if self.needs_probs_recounted[author]:
                prob_dists[author] = KneserNeyProbDist(freqdists[author])
                self.needs_probs_recounted[author] = False

        best_score = None
        likely_author = None
        # items() exists on both Python 2 and 3; iteritems() is Python 2 only.
        for author, probdist in prob_dists.items():
            probs = array(
                [probdist.prob(trigram) for trigram in shape_trigrams],
                dtype='float')
            score = log(probs + smoothing).sum()

            # Explicit None check: Python 3 raises TypeError when a number is
            # compared with None, while Python 2 treated None as smaller than
            # any number, so the first author always won this comparison.
            if best_score is None or score > best_score:
                likely_author = author
                best_score = score

        return likely_author, best_score
Ejemplo n.º 4
0
class KneserNeyModel(BaseNgramModel):
    """
    N-gram model whose probabilities come from NLTK's KneserNeyProbDist
    """
    def __init__(self, *args):
        """
        Forward all arguments to the base model, then build the
        Kneser-Ney distribution from self.ngrams
        """
        super(KneserNeyModel, self).__init__(*args)
        ngram_counts = self.ngrams
        self.model = KneserNeyProbDist(ngram_counts)

    def score(self, word, context):
        """
        Return the probability of the trigram formed by the first two
        items of context followed by word
        """
        first, second = context[0], context[1]
        return self.model.prob((first, second, word))

    def samples(self):
        """
        Return the samples of the underlying distribution
        """
        distribution = self.model
        return distribution.samples()

    def prob(self, sample):
        """
        Return the probability the underlying distribution gives sample
        """
        distribution = self.model
        return distribution.prob(sample)
Ejemplo n.º 5
0
    def complete(self, author, tokens, num_words, iters=100):
        """Generate a continuation of ``tokens`` from ``author``'s model.

        Args:
            author: key into ``self.freqdists`` / ``self.prob_dists``.
            tokens: token sequence to continue; it is copied, not modified.
            num_words: forwarded unchanged to ``generate``.
            iters: forwarded unchanged to ``generate``.

        Returns:
            Whatever ``generate`` returns for the chosen context.
        """
        if self.needs_probs_recounted[author]:
            rebuilt = KneserNeyProbDist(self.freqdists[author])
            self.prob_dists[author] = rebuilt
            add_unigrams(rebuilt)
            # NOTE(review): the flag is not cleared here, so the rebuild is
            # repeated on every call while it stays set -- confirm intent.

        probdist = self.prob_dists[author]

        # Drop trailing tokens until the last two form a bigram the model
        # knows, or until nothing is left.
        remaining = list(tokens)
        while remaining and tuple(remaining[-2:]) not in probdist._bigrams:
            remaining.pop()

        if remaining:
            context = tuple(remaining[-2:])
        else:
            # No known bigram anywhere in the input.
            context = (None, None)

        return generate(probdist, context, num_words, iters)
Ejemplo n.º 6
0
print(out)
# Purpose of maximum likelihood estimation: use the observed sample results
# to infer the parameter values most likely (with maximum probability) to
# have produced them.
"""最大似然估计的目的就是:利用已知的样本结果,反推最有可能(最大概率)导致这样结果的参数值。"""
# Maximum likelihood estimation on Wikipedia (URL in the string below).
"""最大似然估计wiki https://zh.wikipedia.org/zh-cn/%E6%9C%80%E5%A4%A7%E4%BC%BC%E7%84%B6%E4%BC%B0%E8%AE%A1"""
# Hidden Markov Model (HMM) estimation.
"""隐马尔科夫模型估计 HMM"""
corpus = nltk.corpus.brown.tagged_sents(categories='adventure')[:700]
print(len(corpus))
tag_set = unique_list(tag for sent in corpus for (word, tag) in sent)
print(len(tag_set))
# Smoothing.
"""平滑"""
# gt = lambda fd, bins:SimpleGoodTuringProbDist(fd, bins=1e5)
# train_and_test(gt)

# Smoothing corpus: for the first 100 sentences, replace every run of three
# consecutive (word, tag) pairs by one (word-triple, tag-triple) pair.
corpus = [
    [((first[0], second[0], third[0]), (first[1], second[1], third[1]))
     for first, second, third in nltk.trigrams(sent)]
    for sent in corpus[:100]
]
tag_set = unique_list(tag for sent in corpus for (word, tag) in sent)
print(len(tag_set))
symbols = unique_list(word for sent in corpus for (word, tag) in sent)
print(len(symbols))
trainer = nltk.tag.HiddenMarkovModelTrainer(tag_set, symbols)

# Every tenth sentence (index 0, 10, 20, ...) goes to the test set; all the
# others go to the training set.
train_corpus = []
test_corpus = []
for i, sent in enumerate(corpus):
    target = train_corpus if i % 10 else test_corpus
    target.append(sent)
print(len(train_corpus))
print(len(test_corpus))


def kn(fd, bins):
    # Estimator factory: ``bins`` is accepted but not used.
    return KneserNeyProbDist(fd)
# train_and_test(kn)
Ejemplo n.º 7
0
 def __init__(self, *args):
     """
     Forward all arguments to the parent initializer, then build the
     Kneser-Ney distribution from self.ngrams
     """
     super(KneserNeyModel, self).__init__(*args)
     ngram_counts = self.ngrams
     self.model = KneserNeyProbDist(ngram_counts)
Ejemplo n.º 8
0
if __name__ == '__main__':
    # Script entry point: build a Kneser-Ney trigram model from the tweet
    # files under ./twitter-files and use it to complete some prompts.
    print("Lab 4 Exercise 2")
    # Raw string: "\." in a normal string literal is an invalid escape
    # sequence (a warning on current Python versions); the pattern value
    # itself is unchanged.
    corpus_reader = PlaintextCorpusReader(root="./twitter-files",
                                          fileids=r".*\.txt",
                                          word_tokenizer=TweetTokenizer())

    # Convert tweets to tri-grams, padding both ends so that the start and
    # the end of every tweet are represented in the model.
    tweets = list(corpus_reader.sents())
    tweet_trigrams = [
        list(
            ngrams(sequence=tweet,
                   n=3,
                   pad_left=True,
                   pad_right=True,
                   left_pad_symbol="<START>",
                   right_pad_symbol="<END>")) for tweet in tweets
    ]
    # Flatten the per-tweet lists into one list of trigrams.
    all_trigrams = [gram for tweet in tweet_trigrams for gram in tweet]

    # Initialize the language model
    freq_dist = FreqDist(all_trigrams)
    model = KneserNeyProbDist(freq_dist)

    # Predict sentences
    inputs = [
        "make America", "I am the", "China is", "The President of",
        "This election", "I love", "Fake News"
    ]
    print("Inputs: {}".format(inputs))
    complete_sentence(inputs, model)
Ejemplo n.º 9
0
            best_generated = list(generated)
            best_score = prob
    return best_generated, best_score**(1. / num)  #geometric mean


def add_unigrams(kn):
    """Attach a unigram count table to ``kn`` as ``kn._unigrams``.

    For every key of ``kn._bigrams`` (each key is iterated as a sequence of
    words) each word it contains adds 1.0 to that word's count, so a word's
    count is the number of bigram-key positions it occupies. The values
    stored in ``kn._bigrams`` are not used.

    Args:
        kn: object with a dict-like ``_bigrams`` attribute; it is modified
            in place by setting ``_unigrams``.

    Returns:
        None.
    """
    unigrams = {}
    # Iterate the keys directly: only the keys are needed, and unlike
    # iteritems() this works on both Python 2 and Python 3.
    for bigram in kn._bigrams:
        for word in bigram:
            unigrams[word] = unigrams.get(word, 0) + 1.
    kn._unigrams = unigrams


if __name__ == '__main__':
    # Script entry point: load pickled token lists, build a Kneser-Ney
    # trigram model from one of them and drop into an interactive console.
    try:
        from cPickle import load  # Python 2
    except ImportError:
        from pickle import load  # Python 3 has no cPickle module
    from code import interact

    # SECURITY NOTE: unpickling can execute arbitrary code; only load token
    # files that come from a trusted source.
    # Open in binary mode (required by pickle on Python 3) and close the
    # files deterministically instead of leaking the handles.
    with open('animal_farm_toks', 'rb') as toks_file:
        animal_farm_toks = load(toks_file)
    with open('1984_toks', 'rb') as toks_file:
        niniteen_eightyfour_farm_toks = load(toks_file)

    fd = FreqDist(trigrams(animal_farm_toks))
    #  fd.update(FreqDist(trigrams(niniteen_eightyfour_farm_toks)))

    kn = KneserNeyProbDist(fd)
    add_unigrams(kn)
    #print generate(kn, ('the', 'day'), 10, 100)
    # Expose everything defined above to the interactive session.
    interact(local=locals())
    #http://www.gilesthomas.com/2010/05/generating-political-news-using-nltk/
    #content_model = NgramModel(3, tokenized_content)
    #starting_words = content_model.generate(100)[-2:]
    #content = content_model.generate(words_to_generate, starting_words)
    #print u' '.join(content)