def trigram_model(tokenized_text, test_sentences, sentence_count):
    """Train a Kneser-Ney trigram model and print its average test perplexity.

    Args:
        tokenized_text: Training corpus handed to ``padded_everygram_pipeline``
            (presumably an iterable of token lists -- confirm against caller).
        test_sentences: Iterable of raw sentence strings. Each one is
            tokenized with ``nltk.tokenize.word_tokenize`` and lower-cased.
        sentence_count: Divisor used for the average. Sentences whose
            perplexity is infinite contribute nothing to the sum, but they
            are not subtracted from this count.

    Raises:
        ZeroDivisionError: If ``sentence_count`` is 0.
    """
    n = 3
    train_data, padded_vocab = padded_everygram_pipeline(n, tokenized_text)
    model = KneserNeyInterpolated(n)
    model.fit(train_data, padded_vocab)

    # Keep the test tokens under their own name instead of overwriting the
    # ``tokenized_text`` parameter.
    test_tokens = [
        [token.lower() for token in nltk.tokenize.word_tokenize(sent)]
        for sent in test_sentences
    ]
    test_data, _ = padded_everygram_pipeline(n, test_tokens)

    infinity = float('inf')
    total_perplexity = 0.0
    for test in test_data:
        # Score every sentence exactly once: perplexity is the expensive
        # step, so the result is reused for both the check and the sum.
        sentence_perplexity = model.perplexity(list(test))
        if sentence_perplexity != infinity:
            total_perplexity += sentence_perplexity

    average_perplexity = total_perplexity / sentence_count
    print(
        f"Average Perplexity for Trigram model on Test tweets: {round(average_perplexity, 4)}"
    )
# Example no. 2
# 0
# Strip ASCII letters, digits and whitespace from every sentence in one pass.
# Whitespace has to go *before* empty strings are dropped: a sentence that is
# only whitespace once the ASCII characters are removed would otherwise
# survive as '' and end up as a bogus [''] training sentence.
data = [re.sub(r'[a-zA-Z0-9\s]', '', sent) for sent in sentences_strings_ted]
# One token per remaining character; sentences with nothing left are skipped.
datax = [list(sent) for sent in data if sent]

# Release the intermediate corpora before training.
del sentences_strings_ted, data

# Train a 5-gram Kneser-Ney model on the character-level corpus.
train, vocab = padded_everygram_pipeline(5, datax)
lm = KneserNeyInterpolated(5)
lm.fit(train, vocabulary_text=vocab)

# The training streams and the corpus are not needed once the model is fit.
del datax, vocab, train
# Perplexity check on a single sentence.
test = '我想带你们体验一下,我们所要实现的“信任”的感觉。'
# Remove punctuation, then split what is left into one token per character.
sent_list = list(re.sub(r'[^\w\s]', '', test))
# Pad both ends for a 5-gram model and collect the resulting 5-grams.
text = list(ngrams(pad_both_ends(sent_list, n=5), 5))

perplexity = lm.perplexity(text)
entropy = lm.entropy(text)  # cross-entropy
print('交叉熵:%f' % entropy, '困惑度:%f' % perplexity)
# Save the model.  Original author's note: everything from here on runs out
# of memory locally -- run it on Colaboratory or Kaggle instead.
joblib.dump(lm, 'panti_gram.pkl')
# In[]
# Evaluate a stored model on the same 5-grams.
# NOTE(review): this loads 'kn_5gram.pkl', not the 'panti_gram.pkl' written
# above -- confirm which file is intended.
# NOTE(review): joblib.load unpickles its input; only load trusted files.
kn = joblib.load('kn_5gram.pkl')
kn_perplexity = kn.perplexity(text)
kn_entropy = kn.entropy(text)  # cross-entropy
print('KN交叉熵:%f' % kn_entropy, 'KN困惑度:%f' % kn_perplexity)