#!/usr/bin/env python
# -*- coding:utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
import re
import os
import jieba  # jieba Chinese word segmentation
import socketserver
# gensim is used to load the pre-trained word vectors
from gensim.models import KeyedVectors
# Load the pre-trained Chinese word embeddings with gensim
cn_model = KeyedVectors.load_word2vec_format('chinese_word_vectors/sgns.zhihu.bigram', binary=False)
# Each word maps to a vector of length 300
embedding_dim = cn_model['山东大学'].shape[0]
print('Word vector length: {}'.format(embedding_dim))
# print(cn_model['山东大学'])
# Compute the similarity between two words
print(cn_model.similarity('橘子', '橙子'))
# Find the closest words by cosine similarity
print(cn_model.most_similar(positive=['大学'], topn=10))
# Find the word that does not belong with the others
test_words = '老师 会计师 程序员 律师 医生 老人'
test_words_result = cn_model.doesnt_match(test_words.split())
print('In "%s":\nthe word that does not belong is: %s' % (test_words, test_words_result))
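# To make explicit what similarity() computes, here is a manual
# cosine-similarity check (a minimal sketch; it assumes both words are
# in the vocabulary, which the similarity() call above already confirmed):
vec_a = cn_model['橘子']
vec_b = cn_model['橙子']
cosine = np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))
print('manual cosine similarity: %.4f' % cosine)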
pos_txts = os.listdir('pos')
neg_txts = os.listdir('neg')
print("Total samples: %s" % str(len(pos_txts) + len(neg_txts)))
# Collect all review texts into one list
train_texts_orig = []
# After all samples are added, train_texts_orig is a list of 4000 texts:
# the first 2000 are positive reviews, the last 2000 are negative
for filename in pos_txts:
    with open('pos/' + filename, 'r', errors='ignore') as f:
        # The with block closes the file automatically
        train_texts_orig.append(f.read().strip())
for filename in neg_txts:
    with open('neg/' + filename, 'r', errors='ignore') as f:
        train_texts_orig.append(f.read().strip())
print('%s texts in total' % len(train_texts_orig))
# Build the model with TensorFlow's Keras API
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU, Embedding, LSTM, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
# Segment and tokenize every review
# train_tokens is a list of 4000 lists, one per review
train_tokens = []
for text in train_texts_orig:
    # Strip punctuation
    text = re.sub(r"[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+", "", text)
    # Segment with jieba
    cut = jieba.cut(text)
    # jieba.cut returns a generator; materialize it as a list
    cut_list = [i for i in cut]
    for i, word in enumerate(cut_list):
        try:
            # Convert each word to its vocabulary index
            cut_list[i] = cn_model.vocab[word].index
        except KeyError:
            # Words not in the vocabulary get index 0
            cut_list[i] = 0
    train_tokens.append(cut_list)
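# Quick sanity check: each review is now a list of vocabulary indices,
# with 0 marking out-of-vocabulary words, e.g.:
print(train_tokens[0][:10])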
# Standardize the sequence lengths
# Collect the length of every token list
num_tokens = [len(tokens) for tokens in train_tokens]
num_tokens = np.array(num_tokens)
print("Mean token length: %s" % np.mean(num_tokens))
print("Longest review (in tokens): %s" % np.max(num_tokens))
# plt.hist(np.log(num_tokens), bins=100)
# plt.xlim((0,10))
# plt.ylabel('number of tokens')
# plt.xlabel('length of tokens')
# plt.title('Distribution of tokens length')
# plt.show()
# Take the mean token length plus two standard deviations;
# if token lengths are roughly normally distributed, this
# max_tokens covers about 95% of the samples
max_tokens = np.mean(num_tokens) + 2 * np.std(num_tokens)
max_tokens = int(max_tokens)
print('max_tokens: %s' % max_tokens)
# With a cutoff of 236 tokens about 95% of the samples fit entirely;
# shorter sequences will be padded and longer ones truncated
print(np.sum(num_tokens < max_tokens) / len(num_tokens))
# Reverse tokenization: convert token indices back into text
def reverse_tokens(tokens):
    text = ''
    for i in tokens:
        if i != 0:
            text = text + cn_model.index2word[i]
        else:
            text = text + ' '
    return text
# Tokenize a review and recover the text;
# note that all punctuation is gone
reverse = reverse_tokens(train_tokens[0])
print(reverse)
# The original text, for comparison
print(train_texts_orig[0])
# Only use the 50,000 most frequent words
num_words = 50000
# Initialize embedding_matrix for use in Keras below
embedding_matrix = np.zeros((num_words, embedding_dim))
# embedding_matrix is a [num_words, embedding_dim] matrix,
# i.e. 50000 x 300
for i in range(num_words):
    embedding_matrix[i, :] = cn_model[cn_model.index2word[i]]
embedding_matrix = embedding_matrix.astype('float32')
# Check that the indices line up: an output of 300 means all 300
# components of the embedding vector match
print(np.sum(cn_model[cn_model.index2word[333]] == embedding_matrix[333]))
# The shape of embedding_matrix is what Keras expects;
# it is used by the Embedding layer below
print(embedding_matrix.shape)
# Pad and truncate: the input train_tokens is a list of lists,
# the returned train_pad is a numpy array
train_pad = pad_sequences(train_tokens, maxlen=max_tokens, padding='pre', truncating='pre')
# Replace indices beyond the 50,000-word vocabulary with 0
train_pad[train_pad >= num_words] = 0
# After pre-padding, the leading tokens are all 0 and the text sits at the end
# print(train_pad[33])
# Prepare the target vector: the first 2000 samples are 1, the last 2000 are 0
train_target = np.concatenate((np.ones(2000), np.zeros(2000)))
# Split into training and test sets
from sklearn.model_selection import train_test_split
# 90% of the samples for training, the remaining 10% for testing
X_train, X_test, y_train, y_test = train_test_split(train_pad,
                                                    train_target,
                                                    test_size=0.1,
                                                    random_state=12)
# Inspect a training sample to confirm the data looks right
print(reverse_tokens(X_train[35]))
print('class: ', y_train[35])
# Classify the samples with an LSTM
model = Sequential()
# The first layer is the (frozen) embedding
model.add(Embedding(num_words,
                    embedding_dim,
                    weights=[embedding_matrix],
                    input_length=max_tokens,
                    trainable=False))
model.add(Bidirectional(LSTM(units=32, return_sequences=True)))
model.add(LSTM(units=16, return_sequences=False))
model.add(Dense(1, activation='sigmoid'))
# Optimize with Adam at a learning rate of 0.001
optimizer = Adam(lr=1e-3)
model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])
model.summary()
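# The imports above also pull in GRU; a variant of the same model built
# on GRU layers would look like this sketch (kept commented out, since
# the script uses the bidirectional-LSTM model defined above):
# model_gru = Sequential()
# model_gru.add(Embedding(num_words, embedding_dim,
#                         weights=[embedding_matrix],
#                         input_length=max_tokens, trainable=False))
# model_gru.add(GRU(units=32, return_sequences=True))
# model_gru.add(GRU(units=16, return_sequences=False))
# model_gru.add(Dense(1, activation='sigmoid'))
# model_gru.compile(loss='binary_crossentropy', optimizer=Adam(lr=1e-3),
#                   metrics=['accuracy'])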
# Set up a checkpoint for the model weights
path_checkpoint = 'sentiment_checkpoint.keras'
checkpoint = ModelCheckpoint(filepath=path_checkpoint, monitor='val_loss',
                             verbose=1, save_weights_only=True,
                             save_best_only=True)
# Try to load previously trained weights; keep a handle to the default
# graph either way, so the server threads below can run predictions in it
try:
    model.load_weights(path_checkpoint)
    graph = tf.get_default_graph()
except Exception as e:
    graph = tf.get_default_graph()
    print(e)
# Early stopping: stop training if validation loss has not improved for 3 epochs
earlystopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
# Automatically reduce the learning rate when validation loss plateaus
lr_reduction = ReduceLROnPlateau(monitor='val_loss', factor=0.1, min_lr=1e-5, patience=0, verbose=1)
# Collect the callbacks
callbacks = [
    earlystopping,
    checkpoint,
    lr_reduction
]
# Start training (left commented out here because the weights are loaded
# from the checkpoint above; uncomment to train from scratch)
print('Start training...')
# model.fit(X_train, y_train,
#           validation_split=0.1,
#           epochs=20,
#           batch_size=128,
#           callbacks=callbacks)
result = model.evaluate(X_test, y_test)
print('Accuracy:{0:.2%}'.format(result[1]))
# Sample reviews for manual testing with predict_sentiment (defined in MyServer below):
# test_list = [
# '你好啊',
# '酒店设施不是新的,服务态度很不好',
# '酒店卫生条件非常不好',
# '床铺非常舒适',
# '房间很凉,不给开暖气',
# '房间很凉爽,空调冷气很足',
# '酒店环境不好,住宿体验很不好',
# '房间隔音不到位' ,
# '晚上回来发现没有打扫卫生',
# '因为过节所以要我临时加钱,比团购的价格贵'
# ]
# for text in test_list:
# predict_sentiment(text)
class MyServer(socketserver.BaseRequestHandler):
    """
    socketserver handler: receives a review over the socket and
    replies with the predicted sentiment
    """
    def handle(self):
        conn = self.request
        conn.sendall('Welcome to the socketserver sentiment server!'.encode())
        while True:
            data = conn.recv(1024).decode()
            if not data or data == "exit":
                print("Closing the connection to %s!" % (self.client_address,))
                break
            ret = self.predict_sentiment(data)
            print("%s" % data)
            print("%s" % ret)
            conn.sendall(('%s' % ret).encode())
    def predict_sentiment(self, text):
        print(text)
        # Strip punctuation
        text = re.sub(r"[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+", "", text)
        # Segment with jieba
        cut = jieba.cut(text)
        cut_list = [i for i in cut]
        # Tokenize: map each word to its vocabulary index (0 if unknown)
        for i, word in enumerate(cut_list):
            try:
                cut_list[i] = cn_model.vocab[word].index
            except KeyError:
                cut_list[i] = 0
        # Pad / truncate to max_tokens
        tokens_pad = pad_sequences([cut_list], maxlen=max_tokens,
                                   padding='pre', truncating='pre')
        # Clip indices beyond the 50,000-word vocabulary, as in training
        tokens_pad[tokens_pad >= num_words] = 0
        # Predict inside the graph captured above (the server handles
        # requests in worker threads)
        with graph.as_default():
            result = model.predict(x=tokens_pad)
        coef = result[0][0]
        if coef >= 0.5:
            sed = 'This is a positive review, output=%.2f' % coef
        else:
            sed = 'This is a negative review, output=%.2f' % coef
        print(sed)
        return sed
server = socketserver.ThreadingTCPServer(('127.0.0.1', 9997), MyServer)
server.serve_forever()
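# A minimal client for manual testing (a sketch: run it in a separate
# process while the server above is listening on 127.0.0.1:9997):
# import socket
# client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
# client.connect(('127.0.0.1', 9997))
# print(client.recv(1024).decode())        # welcome message
# client.sendall('床铺非常舒适'.encode())   # send a review to classify
# print(client.recv(1024).decode())        # sentiment prediction
# client.sendall('exit'.encode())          # ask the server to close the connection
# client.close()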