import pickle

import numpy as np
import tensorflow as tf

import data_handler


def __init__(self, file_aim, file_res, file_dict, file_chi2_model,
             file_tensor_model, num_classes):
    # Example arguments:
    # file_aim="D:/python/data/data_tan_test_2.txt"
    # file_res="D:/python/data/res_tan_test_2.txt"
    # file_dict="D:/python/data/dict_tan.pkl"
    # file_tensor_model="D:/python/model/tensorflow/model_tan_1.ckpt-20"
    file_stopwd = "D:/python/data/stopwd.txt"
    stopwdlist = data_handler.stopwordslist(file_stopwd)

    # Load the vocabulary built during training.
    with open(file_dict, 'rb') as pkl_file:
        vocab = pickle.load(pkl_file)

    # Vectorize the target file; [0, 0] is a dummy label, only the features
    # are used at prediction time.
    data = []
    data.extend(data_handler.data_tovec(file_aim, [0, 0], vocab, stopwdlist))
    print(len(data))
    data = np.array(data)

    # Load the fitted chi2 feature-selection model and reduce the features.
    with open(file_chi2_model, 'rb') as pkl_file2:
        model1 = pickle.load(pkl_file2)
    data_x = model1.transform(list(data[:, 0]))
    # data_x = list(data[:, 0])
    print("dict len", len(vocab))

    def write_res(res, res_file_path, aim_file_path):
        # Write one labelled line per input sentence:
        # 0 -> "正向" (positive), 1 -> "负向" (negative), 2 -> "中性" (neutral).
        with open(res_file_path, "a+", encoding='UTF-8') as res_f:
            with open(aim_file_path, "r+", encoding='UTF-8') as aim_f:
                lines = aim_f.readlines()
                for num, line in enumerate(lines):
                    if res[num] == 0:
                        res_f.write("正向 " + line + "\n")
                    if res[num] == 1:
                        res_f.write("负向 " + line + "\n")
                    if res[num] == 2:
                        res_f.write("中性 " + line + "\n")

    # Restore the saved BP network and run prediction.
    with tf.Session() as session:
        new_saver = tf.train.import_meta_graph(file_tensor_model + ".meta")
        new_saver.restore(session, file_tensor_model)
        predict = tf.get_collection('predict')[0]
        graph = tf.get_default_graph()
        X = graph.get_operation_by_name('X').outputs[0]
        Y = graph.get_operation_by_name('Y').outputs[0]  # label placeholder; not needed for prediction
        res = session.run(tf.argmax(predict, 1), feed_dict={X: list(data_x)})
        write_res(res, file_res, file_aim)
        print("结果写入:%s" % file_res)  # "Results written to: %s"
    tf.reset_default_graph()
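# A usage sketch for the routine above. The enclosing class is not shown in this
# fragment, so the constructor call below uses a hypothetical class name
# ("BpPredictor"); the paths are example paths taken from comments in this repo:
#
#     BpPredictor(file_aim="D:/python/data/data_tan_test_2.txt",
#                 file_res="D:/python/data/res_tan_test_2.txt",
#                 file_dict="D:/python/data/dict_tan.pkl",
#                 file_chi2_model="D:/python/data/model_chi2_bp2.pkl",
#                 file_tensor_model="D:/python/model/tensorflow/model_tan_1.ckpt-20",
#                 num_classes=2)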
import numpy as np
import tensorflow as tf

import data_handler


def __init__(self, file_aim, file_res, file_tensor_model, num_classes):
    # Example arguments:
    # file_aim="D:/python/data/data_tan_own.txt"
    # file_res="D:/python/data/res_cnn_own_1.txt"
    # file_tensor_model="D:/python/model/tensorflow/model_data_cnn_1.ckpt-20"
    file_stopwd = "D:/python/data/stopwd.txt"
    sentence_length = 200
    stopwdlist = data_handler.stopwordslist(file_stopwd)

    # Vectorize the target file with word2vec features; [0, 0] is a dummy label.
    test_data_x, test_data_y = data_handler.data_tovec_w2v(
        file_aim, [0, 0], sentence_length, stopwdlist)
    test_data_x = np.array(test_data_x)

    def write_res(res, res_file_path, aim_file_path):
        # 0 -> "正向" (positive), 1 -> "负向" (negative), 2 -> "中性" (neutral).
        with open(res_file_path, "a+", encoding='UTF-8') as res_f:
            with open(aim_file_path, "r+", encoding='UTF-8') as aim_f:
                lines = aim_f.readlines()
                for num, line in enumerate(lines):
                    if res[num] == 0:
                        res_f.write("正向 " + line + "\n")
                    if res[num] == 1:
                        res_f.write("负向 " + line + "\n")
                    if res[num] == 2:
                        res_f.write("中性 " + line + "\n")

    # Prediction
    # ==================================================
    with tf.Session() as sess:
        new_saver = tf.train.import_meta_graph(file_tensor_model + ".meta")
        new_saver.restore(sess, file_tensor_model)
        predictions = tf.get_collection('predictions')[0]
        graph = tf.get_default_graph()
        input_x = graph.get_operation_by_name('input_x').outputs[0]
        input_y = graph.get_operation_by_name('input_y').outputs[0]  # not needed for prediction
        dropout_keep_prob = graph.get_operation_by_name('dropout_keep_prob').outputs[0]
        res = sess.run(predictions,
                       feed_dict={input_x: test_data_x, dropout_keep_prob: 1.0})
        write_res(res, file_res, file_aim)
        print("结果写入:%s" % file_res)  # "Results written to: %s"
import random
import pickle

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif

import data_handler

file_pos = "D:/python/data/data_tan_pos.txt"
file_neg = "D:/python/data/data_tan_neg.txt"
# file_mid = "D:/python/data/douban_data_m.txt"
file_stopwd = "D:/python/data/stopwd.txt"
file_dict = "D:/python/data/dict_class_2.pkl"
file_tensor_model = "D:/python/model/tensorflow/model_bp_class2.ckpt"
file_chi2_model = "D:/python/data/model_chi2_bp2.pkl"

stopwdlist = data_handler.stopwordslist(file_stopwd)


def save_target(target, file_path):
    # Pickle an object to disk with the highest protocol.
    with open(file_path, 'wb') as output:
        pickle.dump(target, output, -1)


# Build the vocabulary from the positive and negative corpora and save it.
data_befvec = data_handler.data_prevocab(file_pos, stopwdlist)
data_befvec += data_handler.data_prevocab(file_neg, stopwdlist)
# data_befvec += data_handler.data_prevocab(file_mid, stopwdlist)
vocab = data_handler.build_vocab(data_befvec, 5)
save_target(vocab, file_dict)
print(len(vocab))
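# The script above defines file_chi2_model but does not fit or save the chi2
# feature-selection model in the fragment shown. A minimal, self-contained sketch
# of how such a model can be fitted and pickled with scikit-learn; the toy
# feature/label arrays below are placeholders, not the project's real data:
def _chi2_model_sketch():
    toy_features = [[1, 0, 2, 0], [0, 3, 0, 1], [2, 0, 1, 0]]  # placeholder term counts
    toy_labels = [0, 1, 0]                                     # placeholder class labels
    model_chi2 = SelectKBest(chi2, k=2)         # keep the 2 highest-scoring features
    model_chi2.fit(toy_features, toy_labels)    # fit on (counts, class labels)
    save_target(model_chi2, file_chi2_model)    # reuse the pickling helper above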
import datetime

import numpy as np
import tensorflow as tf

import data_handler


def __init__(self, file_pos, file_neg, file_aim, file_res, num_classes, file_mid=""):
    # Example arguments:
    # file_pos="D:/python/data/data_tan_pos_s.txt"
    # file_neg="D:/python/data/data_tan_neg_s.txt"
    # file_aim="D:/python/data/data_tan_test_2.txt"
    # file_res="D:/python/data/res_tan_cnn_1.txt"
    file_stopwd = "D:/python/data/stopwd.txt"
    stopwdlist = data_handler.stopwordslist(file_stopwd)

    # Sentence length is the maximum sentence length over the training files.
    sentence_length = 0
    sentence_length = data_handler.max_sentence(file_pos, sentence_length, stopwdlist)
    sentence_length = data_handler.max_sentence(file_neg, sentence_length, stopwdlist)

    if num_classes == 2:
        data_x_pos, data_y_pos = data_handler.data_tovec_w2v(
            file_pos, [1, 0], sentence_length, stopwdlist)
        data_x_neg, data_y_neg = data_handler.data_tovec_w2v(
            file_neg, [0, 1], sentence_length, stopwdlist)
        data_raw_x = data_x_pos + data_x_neg
        data_raw_y = data_y_pos + data_y_neg
        del data_x_neg, data_y_neg, data_x_pos, data_y_pos
    else:
        sentence_length = data_handler.max_sentence(file_mid, sentence_length, stopwdlist)
        data_x_pos, data_y_pos = data_handler.data_tovec_w2v(
            file_pos, [1, 0, 0], sentence_length, stopwdlist)
        data_x_neg, data_y_neg = data_handler.data_tovec_w2v(
            file_neg, [0, 1, 0], sentence_length, stopwdlist)
        data_x_mid, data_y_mid = data_handler.data_tovec_w2v(
            file_mid, [0, 0, 1], sentence_length, stopwdlist)
        data_raw_x = data_x_pos + data_x_neg + data_x_mid
        data_raw_y = data_y_pos + data_y_neg + data_y_mid
        del data_x_neg, data_y_neg, data_x_pos, data_y_pos, data_x_mid, data_y_mid

    # print(data)
    print(len(data_raw_x))
    data_raw_x = np.array(data_raw_x)
    data_raw_y = np.array(data_raw_y)

    # Shuffle the training data.
    shuffle_indices = np.random.permutation(np.arange(len(data_raw_x)))
    train_data_x = data_raw_x[shuffle_indices]
    del data_raw_x
    train_data_y = data_raw_y[shuffle_indices]
    del data_raw_y

    # Vectorize the prediction target with a dummy label.
    test_data_x, test_data_y = data_handler.data_tovec_w2v(
        file_aim, [0, 0], sentence_length, stopwdlist)
    test_data_x = np.array(test_data_x)

    # Training
    # ==================================================
    with tf.Session() as sess:
        sequence_length = sentence_length
        embedding_size = 400
        # filter_sizes = list(map(int, filter_sizes.split(",")))
        filter_sizes = [3, 4, 5]
        num_filters = 64
        l2_reg_lambda = 0.0

        input_x = tf.placeholder(tf.float32, [None, sequence_length, embedding_size],
                                 name="input_x")
        input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
        dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

        l2_loss = tf.constant(0.0)
        embedded_chars_expanded = tf.expand_dims(input_x, -1)

        # One convolution + max-pooling branch per filter size.
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            # Convolution
            filter_shape = [filter_size, embedding_size, 1, num_filters]
            W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
            b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
            conv = tf.nn.conv2d(embedded_chars_expanded, W, strides=[1, 1, 1, 1],
                                padding="VALID", name="conv")
            h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
            # Max-pooling over the whole sentence
            pooled = tf.nn.max_pool(
                h,
                ksize=[1, sequence_length - filter_size + 1, 1, 1],
                strides=[1, 1, 1, 1],
                padding='VALID',
                name="pool")
            pooled_outputs.append(pooled)

        # Concatenate the pooled features and apply dropout.
        num_filters_total = num_filters * len(filter_sizes)
        h_pool = tf.concat(pooled_outputs, 3)
        h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])
        h_drop = tf.nn.dropout(h_pool_flat, dropout_keep_prob)

        # Output (fully connected) layer with L2 regularization.
        W = tf.get_variable(
            "W",
            shape=[num_filters_total, num_classes],
            initializer=tf.contrib.layers.xavier_initializer())
        b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
        l2_loss += tf.nn.l2_loss(W)
        l2_loss += tf.nn.l2_loss(b)
        # Scores, predictions, loss, and accuracy.
        scores = tf.nn.xw_plus_b(h_drop, W, b, name="scores")
        predictions = tf.argmax(scores, 1, name="predictions")
        losses = tf.nn.softmax_cross_entropy_with_logits(logits=scores, labels=input_y)
        loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss
        correct_predictions = tf.equal(predictions, tf.argmax(input_y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")

        # Adam optimizer with a global step counter.
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(1e-3)
        grads_and_vars = optimizer.compute_gradients(loss)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

        sess.run(tf.global_variables_initializer())

        # Mini-batch training loop.
        epochs = 20
        batch_size = 1000
        for epoch in range(epochs):
            i = 0
            while i < len(train_data_x):
                start = i
                end = i + batch_size
                batch_x = train_data_x[start:end]
                batch_y = train_data_y[start:end]
                feed_dict = {
                    input_x: batch_x,
                    input_y: batch_y,
                    dropout_keep_prob: 0.5
                }
                _, step_r, loss_r, accuracy_r = sess.run(
                    [train_op, global_step, loss, accuracy], feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(
                    time_str, step_r, loss_r, accuracy_r))
                i = end
            print("epoch: {}".format(epoch))

        # Predict the target file (dropout disabled; labels are not needed for inference).
        res = sess.run(predictions,
                       feed_dict={input_x: test_data_x, dropout_keep_prob: 1.0})

        with open(file_res, "a+", encoding='UTF-8') as res_f:
            with open(file_aim, "r+", encoding='UTF-8') as aim_f:
                lines = aim_f.readlines()
                for num, line in enumerate(lines):
                    if res[num] == 0:
                        res_f.write("正向 " + line + "\n")
                    if res[num] == 1:
                        res_f.write("负向 " + line + "\n")
                    if res[num] == 2:
                        res_f.write("中性 " + line + "\n")
        print("结果写入:%s" % file_res)  # "Results written to: %s"
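        # Saving the trained model is not shown in the fragment above, yet the CNN
        # prediction script restores a checkpoint via import_meta_graph and reads a
        # "predictions" collection. A minimal sketch of that saving step, run here
        # before the session closes; the checkpoint path is an assumption taken from
        # the predictor's example comments (minus the step suffix):
        checkpoint_path = "D:/python/model/tensorflow/model_data_cnn_1.ckpt"  # assumed path
        tf.add_to_collection("predictions", predictions)  # expose the prediction op for restore
        saver = tf.train.Saver()  # writes the variables and the .meta graph
        saver.save(sess, checkpoint_path, global_step=epochs)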
import random

import numpy as np
import tensorflow as tf
from sklearn.feature_selection import SelectKBest, chi2

import data_handler


def __init__(self, file_pos, file_neg, file_aim, file_res, num_classes, file_mid=""):
    # Example arguments:
    # file_pos="D:/python/data/data_tan_pos.txt"
    # file_neg="D:/python/data/data_tan_neg.txt"
    # file_aim="D:/python/data/data_tan_test_2.txt"
    # file_res="D:/python/data/res_tan_2.txt"
    # file_mid="D:/python/data/data_mid.txt"
    file_stopwd = "D:/python/data/stopwd.txt"
    stopwdlist = data_handler.stopwordslist(file_stopwd)

    # Build the vocabulary from the training corpora plus the target file.
    data_befvec = data_handler.data_prevocab(file_pos, stopwdlist)
    data_befvec += data_handler.data_prevocab(file_neg, stopwdlist)
    data_befvec += data_handler.data_prevocab(file_aim, stopwdlist)
    if num_classes == 3:
        data_befvec += data_handler.data_prevocab(file_mid, stopwdlist)
    len_aim = data_handler.count_lines(file_aim)
    vocab = data_handler.build_vocab(data_befvec, 5)
    print(len(vocab))

    # Vectorize training data (one-hot labels) and the target file (dummy labels).
    data = []
    data_test = []
    if num_classes == 2:
        data.extend(data_handler.data_tovec(file_pos, [1, 0], vocab, stopwdlist))
        data.extend(data_handler.data_tovec(file_neg, [0, 1], vocab, stopwdlist))
        random.shuffle(data)
        data_test.extend(data_handler.data_tovec(file_aim, [0, 0], vocab, stopwdlist))
    else:
        data.extend(data_handler.data_tovec(file_pos, [1, 0, 0], vocab, stopwdlist))
        data.extend(data_handler.data_tovec(file_neg, [0, 1, 0], vocab, stopwdlist))
        data.extend(data_handler.data_tovec(file_mid, [0, 0, 1], vocab, stopwdlist))
        random.shuffle(data)
        data_test.extend(data_handler.data_tovec(file_aim, [0, 0, 0], vocab, stopwdlist))
    print(len(data))
    data = np.array(data)
    data_test = np.array(data_test)

    # Chi2 feature selection: keep the 400 highest-scoring vocabulary features.
    model1 = SelectKBest(chi2, k=400)
    train_data_x = model1.fit_transform(list(data[:, 0]), list(data[:, 1]))
    train_data_y = list(data[:, 1])
    test_data_x = model1.transform(list(data_test[:, 0]))
    test_data_y = data_test[:, 1]
    # train_data_x = data_x[:-test_size]
    # train_data_y = data_y[:-test_size]
    # test_data_x = data_x[-test_size:]
    # test_data_y = data_y[-test_size:]

    # Network definition and training (one hidden layer plus an output layer).
    n_input_layer = 400  # input vector dimension (after chi2 selection)
    n_layer_1 = 400
    n_output_layer = num_classes

    def define_layer(input, input_n, output_n):
        # Add one fully connected layer.
        weight = tf.Variable(tf.random_normal([input_n, output_n]))
        bias = tf.Variable(tf.random_normal([output_n]))
        layer = tf.matmul(input, weight) + bias
        return layer

    def define_network(data):
        # Network to be trained: a ReLU hidden layer followed by a linear output layer.
        layer_1 = define_layer(data, n_input_layer, n_layer_1)
        layer_1 = tf.nn.relu(layer_1)
        layer_output = define_layer(layer_1, n_layer_1, n_output_layer)
        return layer_output

    batch_size = 20
    X = tf.placeholder('float', [None, n_input_layer], name='X')  # placeholders
    Y = tf.placeholder('float', name='Y')

    def train_neural_network(X, Y):
        predict = define_network(X)  # build the network
        reg = tf.contrib.layers.apply_regularization(
            tf.contrib.layers.l2_regularizer(1e-4), tf.trainable_variables())
        # Cost function: softmax cross-entropy plus L2 regularization.
        cost_func = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=predict, labels=Y)) + reg
        optimizer = tf.train.AdamOptimizer().minimize(cost_func)  # Adam optimizer
        # optimizer = tf.train.GradientDescentOptimizer(0.1).minimize(cost_func)
        epochs = 20  # number of training epochs

        with tf.Session() as session:
            session.run(tf.global_variables_initializer())  # initialize TensorFlow variables
            print('训练集数据量 {}'.format(len(train_data_x)))  # training-set size
            for epoch in range(epochs):
                epoch_loss = 0  # accumulated loss for this epoch
                i = 0
                while i < len(train_data_x):
                    start = i
                    end = i + batch_size
                    batch_x = train_data_x[start:end]
                    batch_y = train_data_y[start:end]
                    _, c = session.run([optimizer, cost_func],
                                       feed_dict={X: batch_x, Y: batch_y})
                    epoch_loss += c
                    i = end
                print('迭代次数', epoch, ' : 损失函数', epoch_loss)  # epoch index : epoch loss

            # Predict the target file.
            res = session.run(tf.argmax(predict, 1),
                              feed_dict={X: list(test_data_x), Y: list(test_data_y)})
            with open(file_res, "a+", encoding='UTF-8') as res_f:
                with open(file_aim, "r+", encoding='UTF-8') as aim_f:
                    lines = aim_f.readlines()
                    for num, line in enumerate(lines):
                        if res[num] == 0:
                            res_f.write("正向 " + line + "\n")
                        if res[num] == 1:
                            res_f.write("负向 " + line + "\n")
                        if res[num] == 2:
                            res_f.write("中性 " + line + "\n")
            print("结果写入:%s" % file_res)  # "Results written to: %s"

    train_neural_network(X, Y)
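    # The BP prediction script restores its model with import_meta_graph and reads a
    # "predict" collection, but train_neural_network above does not save a checkpoint
    # in the fragment shown. A minimal sketch of that step, which would sit inside the
    # tf.Session block after the training loop; the checkpoint path below is an
    # assumption reused from the vocabulary-building script:
    #
    #     tf.add_to_collection('predict', predict)
    #     saver = tf.train.Saver()
    #     saver.save(session, "D:/python/model/tensorflow/model_bp_class2.ckpt",
    #                global_step=epochs)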