Code example #1
    def __init__(self, file_aim, file_res, file_dict, file_chi2_model,
                 file_tensor_model, num_classes):

        #file_aim="D:/python/data/data_tan_test_2.txt"
        #file_tensor_model="D:/python/model/tensorflow/model_tan_1.ckpt-20"
        #file_res="D:/python/data/res_tan_test_2.txt"
        file_stopwd = "D:/python/data/stopwd.txt"
        #file_dict="D:/python/data/dict_tan.pkl"

        # Load the stop-word list and the pickled vocabulary built during training.
        stopwdlist = data_handler.stopwordslist(file_stopwd)
        with open(file_dict, 'rb') as pkl_file:
            dict = pickle.load(pkl_file)
        # Vectorize the target file; the [0, 0] label is only a placeholder at prediction time.
        data = []
        data.extend(data_handler.data_tovec(file_aim, [0, 0], dict,
                                            stopwdlist))
        print(len(data))
        data = np.array(data)
        # Load the fitted SelectKBest(chi2) selector and reduce the test vectors with it.
        with open(file_chi2_model, 'rb') as pkl_file2:
            model1 = pickle.load(pkl_file2)
        data_x = model1.transform(list(data[:, 0]))
        #data_x=list(data[:,0])

        print("dict len", len(dict))

        def write_res(res, res_file_path, aim_file_path):
            # Prefix each input line with its predicted label
            # (正向 = positive, 负向 = negative, 中性 = neutral) and append it to the result file.
            with open(res_file_path, "a+", encoding='UTF-8') as res_f:
                with open(aim_file_path, "r+", encoding='UTF-8') as aim_f:
                    lines = aim_f.readlines()
                    for num, line in enumerate(lines):
                        if res[num] == 0:
                            res_f.write("正向  " + line + "\n")
                        if res[num] == 1:
                            res_f.write("负向  " + line + "\n")
                        if res[num] == 2:
                            res_f.write("中性  " + line + "\n")

        with tf.Session() as session:
            # Restore the trained graph from the checkpoint and look up its tensors by name.
            new_saver = tf.train.import_meta_graph(file_tensor_model + ".meta")
            new_saver.restore(session, file_tensor_model)
            predict = tf.get_collection('predict')[0]
            graph = tf.get_default_graph()
            X = graph.get_operation_by_name('X').outputs[0]
            Y = graph.get_operation_by_name('Y').outputs[0]

            # Run the restored network on the chi2-reduced vectors and write out the labels.
            res = session.run(tf.argmax(predict, 1),
                              feed_dict={X: list(data_x)})
            write_res(res, file_res, file_aim)
            print("Results written to: %s" % file_res)
        tf.reset_default_graph()
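
The snippet above is the __init__ of a prediction class. A minimal usage sketch, assuming a hypothetical class name BPChi2Predictor and reusing the commented-out default paths (the file_chi2_model path is borrowed from code example #3):

# BPChi2Predictor is a hypothetical name for the class this __init__ belongs to.
BPChi2Predictor(
    file_aim="D:/python/data/data_tan_test_2.txt",        # sentences to classify, one per line
    file_res="D:/python/data/res_tan_test_2.txt",         # output file for labelled lines
    file_dict="D:/python/data/dict_tan.pkl",              # pickled vocabulary
    file_chi2_model="D:/python/data/model_chi2_bp2.pkl",  # pickled SelectKBest(chi2) selector
    file_tensor_model="D:/python/model/tensorflow/model_tan_1.ckpt-20",  # TF checkpoint prefix
    num_classes=2)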
Code example #2
    def __init__(self, file_aim, file_res, file_tensor_model, num_classes):
        file_stopwd = "D:/python/data/stopwd.txt"
        #file_aim="D:/python/data/data_tan_own.txt"
        #file_res="D:/python/data/res_cnn_own_1.txt"
        #file_tensor_model="D:/python/model/tensorflow/model_data_cnn_1.ckpt-20"
        # Fixed sentence length; it must match the length the restored model was trained with.
        sentence_length = 200

        # Vectorize the target file with word2vec embeddings at the fixed sentence length.
        stopwdlist = data_handler.stopwordslist(file_stopwd)
        test_data_x, test_data_y = data_handler.data_tovec_w2v(
            file_aim, [0, 0], sentence_length, stopwdlist)
        test_data_x = np.array(test_data_x)

        def write_res(res, res_file_path, aim_file_path):
            # Prefix each input line with its predicted label
            # (正向 = positive, 负向 = negative, 中性 = neutral) and append it to the result file.
            with open(res_file_path, "a+", encoding='UTF-8') as res_f:
                with open(aim_file_path, "r+", encoding='UTF-8') as aim_f:
                    lines = aim_f.readlines()
                    for num, line in enumerate(lines):
                        if res[num] == 0:
                            res_f.write("正向  " + line + "\n")
                        if res[num] == 1:
                            res_f.write("负向  " + line + "\n")
                        if res[num] == 2:
                            res_f.write("中性  " + line + "\n")

        # Inference
        # ==================================================
        with tf.Session() as sess:
            # Restore the trained CNN graph from the checkpoint and look up its tensors by name.
            new_saver = tf.train.import_meta_graph(file_tensor_model + ".meta")
            new_saver.restore(sess, file_tensor_model)
            predictions = tf.get_collection('predictions')[0]
            graph = tf.get_default_graph()
            input_x = graph.get_operation_by_name('input_x').outputs[0]
            input_y = graph.get_operation_by_name('input_y').outputs[0]
            dropout_keep_prob = graph.get_operation_by_name('dropout_keep_prob').outputs[0]

            # Dropout is disabled (keep probability 1.0) at prediction time.
            res = sess.run(predictions,
                           feed_dict={input_x: test_data_x, dropout_keep_prob: 1.0})
            write_res(res, file_res, file_aim)
            print("Results written to: %s" % file_res)
Code example #3
import random
import pickle
import data_handler

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif

file_pos = "D:/python/data/data_tan_pos.txt"
file_neg = "D:/python/data/data_tan_neg.txt"
#file_mid="D:/python/data/douban_data_m.txt"
file_stopwd = "D:/python/data/stopwd.txt"
file_dict = "D:/python/data/dict_class_2.pkl"
file_tensor_model = "D:/python/model/tensorflow/model_bp_class2.ckpt"
file_chi2_model = "D:/python/data/model_chi2_bp2.pkl"
stopwdlist = data_handler.stopwordslist(file_stopwd)


def save_target(target, file_path):
    # Pickle an object (vocabulary, fitted model, ...) to file_path.
    with open(file_path, 'wb') as output:
        pickle.dump(target, output, -1)


# Build a vocabulary from the tokenized, stopword-filtered positive and negative corpora
# and pickle it so prediction code (e.g. code example #1) can load it again.
data_befvec = data_handler.data_prevocab(file_pos, stopwdlist)
data_befvec += data_handler.data_prevocab(file_neg, stopwdlist)
#data_befvec+=data_handler.data_prevocab(file_mid,stopwdlist)
dict = data_handler.build_vocab(data_befvec, 5)
save_target(dict, file_dict)
print(len(dict))
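
file_chi2_model is declared above but never used in this script, while code example #1 expects it to hold a pickled, fitted SelectKBest(chi2) selector. A minimal sketch of how it could be produced here, assuming data_handler.data_tovec returns (bag-of-words vector, label) pairs as it does in code examples #1 and #5 (SelectKBest and chi2 are already imported above):

# Sketch under the assumptions stated above; mirrors SelectKBest(chi2, k=400) in example #5.
data = (data_handler.data_tovec(file_pos, [1, 0], dict, stopwdlist)
        + data_handler.data_tovec(file_neg, [0, 1], dict, stopwdlist))
data_x = [row[0] for row in data]   # bag-of-words vectors
data_y = [row[1] for row in data]   # one-hot labels
model1 = SelectKBest(chi2, k=400)   # keep the 400 highest-scoring features
model1.fit(data_x, data_y)
save_target(model1, file_chi2_model)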
Code example #4
    def __init__(self,
                 file_pos,
                 file_neg,
                 file_aim,
                 file_res,
                 num_classes,
                 file_mid=""):

        #file_pos="D:/python/data/data_tan_pos_s.txt"
        #file_neg="D:/python/data/data_tan_neg_s.txt"
        file_stopwd = "D:/python/data/stopwd.txt"
        #file_aim="D:/python/data/data_tan_test_2.txt"
        #file_res="D:/python/data/res_tan_cnn_1.txt"
        sentence_length = 0

        stopwdlist = data_handler.stopwordslist(file_stopwd)
        # Track the longest sentence so every sample can be padded to the same length.
        sentence_length = data_handler.max_sentence(file_pos, sentence_length,
                                                    stopwdlist)
        sentence_length = data_handler.max_sentence(file_neg, sentence_length,
                                                    stopwdlist)
        # One-hot labels: positive, negative (and neutral in the 3-class case).
        if num_classes == 2:
            data_x_pos, data_y_pos = data_handler.data_tovec_w2v(
                file_pos, [1, 0], sentence_length, stopwdlist)
            data_x_neg, data_y_neg = data_handler.data_tovec_w2v(
                file_neg, [0, 1], sentence_length, stopwdlist)
            data_raw_x = data_x_pos + data_x_neg
            data_raw_y = data_y_pos + data_y_neg
            del data_x_neg, data_y_neg, data_x_pos, data_y_pos
        else:
            sentence_length = data_handler.max_sentence(
                file_mid, sentence_length, stopwdlist)
            data_x_pos, data_y_pos = data_handler.data_tovec_w2v(
                file_pos, [1, 0, 0], sentence_length, stopwdlist)
            data_x_neg, data_y_neg = data_handler.data_tovec_w2v(
                file_neg, [0, 1, 0], sentence_length, stopwdlist)
            data_x_mid, data_y_mid = data_handler.data_tovec_w2v(
                file_mid, [0, 0, 1], sentence_length, stopwdlist)
            data_raw_x = data_x_pos + data_x_neg + data_x_mid
            data_raw_y = data_y_pos + data_y_neg + data_y_mid
            del data_x_neg, data_y_neg, data_x_pos, data_y_pos, data_x_mid, data_y_mid
        #print(data)
        print(len(data_raw_x))
        data_raw_x = np.array(data_raw_x)
        data_raw_y = np.array(data_raw_y)
        shuffle_indices = np.random.permutation(np.arange(len(data_raw_x)))
        train_data_x = data_raw_x[shuffle_indices]
        del data_raw_x
        train_data_y = data_raw_y[shuffle_indices]
        del data_raw_y

        test_data_x, test_data_y = data_handler.data_tovec_w2v(
            file_aim, [0, 0], sentence_length, stopwdlist)
        test_data_x = np.array(test_data_x)

        # Training
        # ==================================================
        with tf.Session() as sess:
            # Hyperparameters. A list (rather than a set) keeps the filter sizes in a fixed order.
            sequence_length = sentence_length
            embedding_size = 400
            #filter_sizes=list(map(int, filter_sizes.split(",")))
            filter_sizes = [3, 4, 5]
            num_filters = 64
            l2_reg_lambda = 0.0

            input_x = tf.placeholder(tf.float32,
                                     [None, sequence_length, embedding_size],
                                     name="input_x")
            input_y = tf.placeholder(tf.float32, [None, num_classes],
                                     name="input_y")
            dropout_keep_prob = tf.placeholder(tf.float32,
                                               name="dropout_keep_prob")

            l2_loss = tf.constant(0.0)

            # Add a channel dimension: [batch, sequence_length, embedding_size, 1].
            embedded_chars_expanded = tf.expand_dims(input_x, -1)

            pooled_outputs = []
            for i, filter_size in enumerate(filter_sizes):
                # Convolution: a filter of filter_size words spanning the full embedding width.
                filter_shape = [filter_size, embedding_size, 1, num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1),
                                name="W")
                b = tf.Variable(tf.constant(0.1, shape=[num_filters]),
                                name="b")
                conv = tf.nn.conv2d(embedded_chars_expanded,
                                    W,
                                    strides=[1, 1, 1, 1],
                                    padding="VALID",
                                    name="conv")
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
                #maxpooling
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, sequence_length - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="pool")
                pooled_outputs.append(pooled)

            # Concatenate the pooled features from all filter sizes and apply dropout.
            num_filters_total = num_filters * len(filter_sizes)
            h_pool = tf.concat(pooled_outputs, 3)
            h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])
            h_drop = tf.nn.dropout(h_pool_flat, dropout_keep_prob)

            # Fully connected output layer; its weights contribute to the L2 penalty.
            W = tf.get_variable(
                "W",
                shape=[num_filters_total, num_classes],
                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
            l2_loss += tf.nn.l2_loss(W)
            l2_loss += tf.nn.l2_loss(b)
            scores = tf.nn.xw_plus_b(h_drop, W, b, name="scores")
            predictions = tf.argmax(scores, 1, name="predictions")

            losses = tf.nn.softmax_cross_entropy_with_logits(logits=scores,
                                                             labels=input_y)
            loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss

            correct_predictions = tf.equal(predictions, tf.argmax(input_y, 1))
            accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"),
                                      name="accuracy")

            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(1e-3)
            grads_and_vars = optimizer.compute_gradients(loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            sess.run(tf.global_variables_initializer())
            epochs = 20
            batch_size = 1000
            for epoch in range(epochs):
                i = 0
                while i < len(train_data_x):
                    start = i
                    end = i + batch_size
                    batch_x = train_data_x[start:end]
                    batch_y = train_data_y[start:end]
                    feed_dict = {
                        input_x: batch_x,
                        input_y: batch_y,
                        dropout_keep_prob: 0.5
                    }
                    _, step_r, loss_r, accuracy_r = sess.run(
                        [train_op, global_step, loss, accuracy], feed_dict)
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}".format(
                        time_str, step_r, loss_r, accuracy_r))
                    i = end
                print("epoch: {}", epoch)

            # Predict labels for the target file; no labels are needed for inference,
            # so only input_x and dropout_keep_prob are fed.
            res = sess.run(predictions,
                           feed_dict={
                               input_x: test_data_x,
                               dropout_keep_prob: 1.0
                           })
            # Prefix each input line with its predicted label
            # (正向 = positive, 负向 = negative, 中性 = neutral) and append it to the result file.
            with open(file_res, "a+", encoding='UTF-8') as res_f:
                with open(file_aim, "r+", encoding='UTF-8') as aim_f:
                    lines = aim_f.readlines()
                    for num, line in enumerate(lines):
                        if res[num] == 0:
                            res_f.write("正向  " + line + "\n")
                        if res[num] == 1:
                            res_f.write("负向  " + line + "\n")
                        if res[num] == 2:
                            res_f.write("中性  " + line + "\n")
            print("Results written to: %s" % file_res)
Code example #5
    def __init__(self,
                 file_pos,
                 file_neg,
                 file_aim,
                 file_res,
                 num_classes,
                 file_mid=""):

        #file_pos="D:/python/data/data_tan_pos.txt"
        #file_neg="D:/python/data/data_tan_neg.txt"
        #file_aim="D:/python/data/data_tan_test_2.txt"
        #file_res="D:/python/data/res_tan_2.txt"
        #file_mid="D:/python/data/data_mid.txt"
        file_stopwd = "D:/python/data/stopwd.txt"

        stopwdlist = data_handler.stopwordslist(file_stopwd)
        # Build the vocabulary over all corpora, including the file to be classified.
        data_befvec = data_handler.data_prevocab(file_pos, stopwdlist)
        data_befvec += data_handler.data_prevocab(file_neg, stopwdlist)
        data_befvec += data_handler.data_prevocab(file_aim, stopwdlist)
        if num_classes == 3:
            data_befvec += data_handler.data_prevocab(file_mid, stopwdlist)
        len_aim = data_handler.count_lines(file_aim)
        dict = data_handler.build_vocab(data_befvec, 5)
        print(len(dict))

        data = []
        data_test = []
        if num_classes == 2:
            data.extend(
                data_handler.data_tovec(file_pos, [1, 0], dict, stopwdlist))
            data.extend(
                data_handler.data_tovec(file_neg, [0, 1], dict, stopwdlist))
            random.shuffle(data)
            data_test.extend(
                data_handler.data_tovec(file_aim, [0, 0], dict, stopwdlist))
        else:
            data.extend(
                data_handler.data_tovec(file_pos, [1, 0, 0], dict, stopwdlist))
            data.extend(
                data_handler.data_tovec(file_neg, [0, 1, 0], dict, stopwdlist))
            data.extend(
                data_handler.data_tovec(file_mid, [0, 0, 1], dict, stopwdlist))
            random.shuffle(data)
            data_test.extend(
                data_handler.data_tovec(file_aim, [0, 0, 0], dict, stopwdlist))
        print(len(data))

        data = np.array(data)
        data_test = np.array(data_test)
        # Reduce the bag-of-words vectors to the 400 highest-scoring chi-squared features.
        model1 = SelectKBest(chi2, k=400)
        train_data_x = model1.fit_transform(list(data[:, 0]), list(data[:, 1]))
        train_data_y = list(data[:, 1])
        test_data_x = model1.transform(list(data_test[:, 0]))
        test_data_y = data_test[:, 1]

        #train_data_x = data_x[:-test_size]
        #train_data_y = data_y[:-test_size]
        #test_data_x = data_x[-test_size:]
        #test_data_y = data_y[-test_size:]

        # Network definition and training (one hidden ReLU layer of 400 units).

        n_input_layer = 400  # input dimension; matches k in SelectKBest above
        n_layer_1 = 400
        n_output_layer = num_classes

        def define_layer(inputs, input_n, output_n):  # one fully connected layer
            weight = tf.Variable(tf.random_normal([input_n, output_n]))
            bias = tf.Variable(tf.random_normal([output_n]))
            layer = tf.matmul(inputs, weight) + bias
            return layer

        # The network to be trained: one hidden ReLU layer followed by a linear output layer.

        def define_network(data):
            layer_1 = define_layer(data, n_input_layer, n_layer_1)
            layer_1 = tf.nn.relu(layer_1)
            layer_output = define_layer(layer_1, n_layer_1, n_output_layer)
            return layer_output

        batch_size = 20
        X = tf.placeholder('float', [None, n_input_layer], name='X')  # placeholders
        Y = tf.placeholder('float', name='Y')

        # Train the network on the labelled data, then classify the target file.
        def train_neural_network(X, Y):
            predict = define_network(X)  # build the network
            reg = tf.contrib.layers.apply_regularization(
                tf.contrib.layers.l2_regularizer(1e-4),
                tf.trainable_variables())
            cost_func = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(
                    logits=predict, labels=Y)) + reg  # softmax cross-entropy plus L2 penalty
            optimizer = tf.train.AdamOptimizer().minimize(cost_func)  # Adam optimizer
            #optimizer=tf.train.GradientDescentOptimizer(0.1).minimize(cost_func)

            epochs = 20  # number of training epochs
            with tf.Session() as session:
                session.run(tf.global_variables_initializer())  # initialize variables

                print('Training set size: {}'.format(len(train_data_x)))
                for epoch in range(epochs):
                    epoch_loss = 0  # accumulated loss for this epoch
                    i = 0
                    while i < len(train_data_x):
                        start = i
                        end = i + batch_size
                        batch_x = train_data_x[start:end]
                        batch_y = train_data_y[start:end]
                        _, c = session.run([optimizer, cost_func],
                                           feed_dict={
                                               X: batch_x,
                                               Y: batch_y
                                           })
                        epoch_loss += c
                        i = end
                    print('epoch', epoch, ': loss', epoch_loss)

                # Predict labels for the test vectors; the Y feed is a dummy and is not used
                # by the argmax fetch.
                res = session.run(tf.argmax(predict, 1),
                                  feed_dict={
                                      X: list(test_data_x),
                                      Y: list(test_data_y)
                                  })

                # Prefix each input line with its predicted label
                # (正向 = positive, 负向 = negative, 中性 = neutral) and append it to the result file.
                with open(file_res, "a+", encoding='UTF-8') as res_f:
                    with open(file_aim, "r+", encoding='UTF-8') as aim_f:
                        lines = aim_f.readlines()
                        for num, line in enumerate(lines):
                            if res[num] == 0:
                                res_f.write("正向  " + line + "\n")
                            if res[num] == 1:
                                res_f.write("负向  " + line + "\n")
                            if res[num] == 2:
                                res_f.write("中性  " + line + "\n")
                print("Results written to: %s" % file_res)

        train_neural_network(X, Y)
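
Code example #1 restores this model via tf.get_collection('predict') and the 'X'/'Y' placeholder names defined above, so the training session would also need to register the prediction op and save a checkpoint. A minimal sketch of that missing step, placed inside the session block after the training loop; it is an assumption, and the file_tensor_model path is borrowed from code example #3:

                # Inside the `with tf.Session() as session:` block, after the training loop.
                # file_tensor_model (e.g. "D:/python/model/tensorflow/model_bp_class2.ckpt",
                # from code example #3) is an assumption here.
                tf.add_to_collection('predict', predict)  # what example #1 looks up
                saver = tf.train.Saver()
                saver.save(session, file_tensor_model, global_step=epochs)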