Example #1
0
    # 保存到文件
    with open('corpus.pkl','wb') as file:
        pickle.dump(corpus,file)
else:
    # 调用上次保存的结果
    with open('corpus.pkl','rb') as file:
        corpus = pickle.load(file)

import sklearn
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

# Build the TF-IDF matrix of the corpus (a weight for how important each token is).
# min_df=0.015 drops tokens that occur in fewer than 1.5% of the documents, so
# very rare tokens are not considered; `encoding` is only consulted when the
# input is given as bytes or files.
countvectorizer = CountVectorizer(encoding='gb18030', min_df=0.015)
tfidftransformer = TfidfTransformer()
# NOTE: as in the original script, the name `countvectorizer` is rebound here
# from the vectorizer object to the document-term count matrix it produces.
countvectorizer = countvectorizer.fit_transform(corpus)
# TfidfTransformer operates on a count matrix, not on raw text, and the method
# is `fit_transform` (the original called a non-existent `fit_transformer` on
# the raw corpus).
tfidf = tfidftransformer.fit_transform(countvectorizer)

# Label each article: 1 when its source contains '新华', otherwise 0.
# str() guards against non-string source values.
label = [1 if '新华' in str(source) else 0 for source in news.source]
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
# Densify once and reuse the array for both the split and the final prediction.
features = tfidf.toarray()
# Split the data set; test_size is the fraction held out for testing.
X_train, X_text, y_train, y_test = train_test_split(features, label, test_size=0.3)
model = MultinomialNB()
model.fit(X_train, y_train)
# Use the model to classify the style of every article (not only the held-out
# part); predicting on the held-out split alone would be model.predict(X_text).
prediction = model.predict(features)
labels = np.array(label)  # convert the label list to an array
# The original target name `compare_news-index` is not a valid identifier
# (it parses as a subtraction and is a SyntaxError); use an underscore.
compare_news_index = pd.DataFrame({'prediction': prediction, 'labels': labels})
# Candidate plagiarised content: rows where `labels` and `prediction` disagree,
# presumably labels == 0 but prediction == 1 -- confirm against the code that
# consumes compare_news_index.