def process_text(text):
    """Tokenize, normalize, pad, and id-encode a corpus, then pickle it.

    Each element of *text* is a list whose first entry is a raw sentence
    string; that entry is replaced in place by ``[token_ids, real_length]``
    with every row padded to the corpus-wide maximum length.  The whole
    structure is dumped to ``data/corpus/train_2``.
    """
    glossary, word2id = reader.read_glossary()
    for index, item in enumerate(text):
        # Filtering into a new list fixes the original remove-while-iterating
        # bug: list.remove() inside `for ... in enumerate(temp)` skipped the
        # element following each removed one, so consecutive spaces survived.
        temp = [t for t in jieba.cut(item[0]) if t not in (' ', '\u3000', ' ')]
        for jndex, jtem in enumerate(temp):
            word = jtem.lower()
            word = re.sub(r'[0-9]+', '^数', word)  # collapse digit runs into one marker
            if word not in glossary:
                word = '^替'  # out-of-vocabulary placeholder
            temp[jndex] = word
        temp.append('^终')  # end-of-sequence marker
        text[index][0] = [temp, len(temp)]
    # Pad every sequence to the longest real length in the corpus.
    max_len = max(i[0][1] for i in text)
    for index, item in enumerate(text):
        while len(item[0][0]) < max_len:
            item[0][0].append('^填')  # padding marker
    for item in text:
        item[0][0] = [word2id[word] for word in item[0][0]]
    with open('data/corpus/train_2', 'wb') as fp:
        pickle.dump(text, fp)
def get_word_sentiment_polarity(words):
    """Score each in-vocabulary word with the restored sentiment model.

    Prints every word with its raw score and class, and returns a list of
    ``[word, raw_score, label]`` triples where label is '负面' (negative)
    or '非负面' (non-negative).
    """
    vocab, word2id = reader.read_glossary()
    # Keep only known words; the model expects each input as a length-1 id list.
    words = [[word2id[word]] for word in words if word in vocab]
    with tf.Graph().as_default(), tf.Session() as sess:
        model = wm(
            glossary_size=FLAGS.glossary_size,
            embedding_size=FLAGS.embedding_size,
            hidden_size=FLAGS.hidden_size,
        )
        model.buildGraph()
        saver = tf.train.Saver(tf.trainable_variables())
        sp = []
        if reader.restore_from_checkpoint(sess, saver, FLAGS.ckpt_dir):
            test_feed_dict = {model.inputs: words}
            raw_expection, expection = sess.run(
                [model.raw_expection, model.expection],
                feed_dict=test_feed_dict)
            for index in range(len(words)):
                # words[index] is a one-element list [id]; index vocab with
                # the scalar id.  The original `vocab[words[index]]` indexed
                # a list with a list and raised TypeError.
                word_str = vocab[words[index][0]]
                temp = [word_str]
                print(word_str, end='\t')
                print(raw_expection[index][0], end='\t')
                temp.append(raw_expection[index][0])
                if expection[index][0] == 1:
                    print('负面')
                    temp.append('负面')
                elif expection[index][0] == 0:
                    print('非负面')
                    temp.append('非负面')
                sp.append(temp)
        return sp
def test_onesent(text):
    """Run the sentence-level sentiment model on a single raw sentence.

    Preprocesses *text* into one padded sequence, restores the checkpoint,
    prints per-token attention and logits, and returns the prediction row.
    """
    corpus = reader.preprocess([[text]], seq_lenth=FLAGS.seq_lenth,
                               seq_num=1, overlap_lenth=0,
                               input_label=False, output_index=False)
    vocab, word2id = reader.read_glossary()
    print(corpus)
    test_inputs, test_lenths = [], []
    test_num = 0
    for entry in corpus:
        test_inputs.append(entry[0])
        test_lenths.append(entry[1])
        test_num += 1
    with tf.Graph().as_default(), tf.Session() as sess:
        model = em_sent(seq_size=FLAGS.seq_lenth,
                        glossary_size=FLAGS.glossary_size,
                        embedding_size=FLAGS.embedding_size,
                        hidden_size=FLAGS.hidden_size,
                        attn_lenth=FLAGS.attn_lenth,
                        is_training=False)
        model.buildTrainGraph()
        saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=10)
        if reader.restore_from_checkpoint(sess, saver, FLAGS.ckpt_dir):
            feed = {
                model.inputs: test_inputs,
                model.lenths: test_lenths,
                model.lenths_weight: padded_ones_list_like(test_lenths,
                                                           FLAGS.seq_lenth),
            }
            expection, alpha, logits = sess.run(
                [model.expection, model.alpha, model.logits], feed_dict=feed)
            print([vocab[i] for i in test_inputs[0]])
            # Show each token with its attention weight and logit.
            for pos, word_id in enumerate(test_inputs[0]):
                print(vocab[word_id], alpha[0][pos], logits[0][pos])
            # print([vocab[word] for word in test_inputs])
            print('负面' if expection[0][0] == 1 else '正面')
            return expection[0]
def process_double_text(text):
    """Tokenize sentence pairs, concatenate, pad, id-encode, and pickle.

    Each item of *text* is ``[sent_a, sent_b, label]``.  Both sentences are
    cleaned and '^终'-terminated, concatenated into one token sequence, and
    padded to the corpus maximum.  The word-level result is dumped to
    ``data/corpus/train_2_oigin`` (filename typo kept for compatibility)
    and the id-encoded result to ``data/corpus/train_2``.
    """
    glossary, word2id = reader.read_glossary()

    def _clean(sentence):
        """Tokenize one sentence into normalized in-vocabulary tokens."""
        # Filtering into a new list fixes the original remove-while-iterating
        # bug, which skipped the token following each removed space.
        tokens = [t for t in jieba.cut(sentence)
                  if t not in (' ', '\u3000', ' ')]
        for i, tok in enumerate(tokens):
            word = tok.lower()
            word = re.sub(r'[0-9]+', '^数', word)  # collapse digit runs
            if word not in glossary:
                word = '^替'  # out-of-vocabulary placeholder
            tokens[i] = word
        tokens.append('^终')  # end-of-sequence marker
        return tokens

    new_text = []
    for item in text:
        tokens = _clean(item[0])
        tokens.extend(_clean(item[1]))
        new_text.append([[tokens, len(tokens)], item[2]])
    # Pad every concatenated pair to the longest real length.
    max_len = max(i[0][1] for i in new_text)
    for item in new_text:
        while len(item[0][0]) < max_len:
            item[0][0].append('^填')  # padding marker
    with open('data/corpus/train_2_oigin', 'wb') as fp:
        pickle.dump(new_text, fp)
    for item in new_text:
        item[0][0] = [word2id[word] for word in item[0][0]]
    with open('data/corpus/train_2', 'wb') as fp:
        pickle.dump(new_text, fp)
def test_onesent(sess, sent):
    """Predict sentiment for one raw sentence with the encoder model.

    Cleans and pads *sent* to the fixed test-graph length (150), restores
    the checkpoint from 'save/pt_bi_lstm_attn/2', and prints the attention
    weights and the prediction.
    """
    import jieba
    import re
    glossary, word2id = reader.read_glossary()
    # Filtering into a new list fixes the original remove-while-iterating
    # bug, which skipped the token following each removed space.
    temp = [t for t in jieba.cut(sent) if t not in (' ', '\u3000', ' ')]
    for index, item in enumerate(temp):
        word = item.lower()
        word = re.sub(r'[0-9]+', '^数', word)  # collapse digit runs
        if word not in glossary:
            word = '^替'  # out-of-vocabulary placeholder
        temp[index] = word
    temp.append('^终')  # end-of-sequence marker
    num = len(temp)  # real (unpadded) length fed to the model
    while len(temp) < 150:
        temp.append('^填')  # pad to the fixed test-graph length
    print(sent)
    sent = np.array([word2id[item] for item in temp])
    print(sent)
    print([glossary[i] for i in sent])
    model = EncoderModel(batch_size=FLAGS.batch_size,
                         glossary_size=FLAGS.glossary_size,
                         embedding_size=FLAGS.embedding_size,
                         hidden_size=FLAGS.hidden_size,
                         attn_lenth=FLAGS.attn_lenth)
    model.build_test_graph(150)
    saver = tf.train.Saver()
    if restore_from_checkpoint(sess, saver, 'save/pt_bi_lstm_attn/2'):
        result, alpha = sess.run(
            [model.expection, model.alpha],
            feed_dict={
                model.test_inputs: [sent],
                model.test_lenth: [num]
            })
        print("Predict result: ")
        print(alpha)
        print(result[0])
def main(_):
    """Dump per-word attention scores, sorted ascending, to words.txt.

    Rebuilds only the embedding + attention sub-graph (sizes hard-coded to
    match the trained checkpoint), restores it from FLAGS.ckpt_dir, and
    writes one "word<TAB>score" line per glossary entry.
    """
    glossary_size = 35156
    embedding_size = 400
    attn_lenth = 350
    with tf.Graph().as_default(), tf.Session() as sess:
        with tf.name_scope('embeddings'), tf.variable_scope('embeddings'):
            # embeddings = tf.Variable(pretrained_wv, name='embeddings')
            embeddings = tf.Variable(
                tf.truncated_normal([glossary_size, embedding_size],
                                    stddev=0.1),
                name='embeddings')
        with tf.name_scope('attention'), tf.variable_scope('attention'):
            u1_w = tf.Variable(
                tf.truncated_normal([embedding_size, attn_lenth], stddev=0.1),
                name='attention_w')
            u1_b = tf.Variable(tf.constant(0.1, shape=[attn_lenth]),
                               name='attention_b')
            u2_w = tf.Variable(
                tf.truncated_normal([attn_lenth, 1], stddev=0.1),
                name='attention_u')
            # Unnormalized attention score for every word embedding.
            attned_1 = tf.matmul(
                tf.nn.relu(tf.matmul(embeddings, u1_w) + u1_b), u2_w)
            # attned_2 = tf.nn.softmax(attned_1, dim=relu)
        saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=10)
        if restore_from_checkpoint(sess, saver, FLAGS.ckpt_dir):
            result = [i[0] for i in sess.run(attned_1)]
            glossary, _ = reader.read_glossary()
            g_r = sorted(zip(glossary, result), key=lambda x: x[1])
            # 'with' guarantees the file handle is closed — the original
            # leaked it via a bare open() that was never closed.
            with open('words.txt', 'w', encoding='utf8') as f:
                for word, score in g_r:
                    f.write(word + '\t' + str(score) + '\n')
        else:
            print('errrrrror@@')
def test_sent(sess):
    """Evaluate the em_sent model on the 'yhwc_150' held-out test split.

    Restores a checkpoint from 'save/pt-bi-lstm-attn', runs the test set in
    batches, prints every example with its prediction, then reports average
    loss, accuracy, and precision/recall/F for both classes (negatives are
    treated as the class of interest).  Returns the list of predictions on
    success; prints "error!" and returns None if restore fails.
    """
    # _, _, test_corpus = reader.read_corpus(index='1_0.2', pick_train=False, pick_valid=False, pick_test=True)
    # test_corpus, _, _ = reader.read_corpus(index=0, pick_train=True, pick_valid=False, pick_test=False)
    _, _, test_corpus = reader.read_corpus(index='yhwc_150', pick_train=False,
                                           pick_valid=False, pick_test=True)
    glossary, word2id = reader.read_glossary()
    test_inputs = []
    test_lenth = []
    test_labels = []
    test_num = 0
    for item in test_corpus:
        # item[0] is [token_ids, real_length]; item[1] is the label.
        test_inputs.append(item[0][0])
        test_lenth.append(int(item[0][1]))
        if item[1] in [1, 'T', 1.0]:
            test_labels.append(1)
        elif item[1] in [0, 'F', 0.0]:
            test_labels.append(0)
        # NOTE(review): test_num (and inputs/lenths) grow even when the label
        # matches neither branch, which would desynchronize inputs and
        # labels — confirm labels are always in {T, F, 0, 1}.
        test_num += 1
    model = em_sent(
        batch_size=FLAGS.batch_size,
        glossary_size=FLAGS.glossary_size,
        embedding_size=FLAGS.embedding_size,
        hidden_size=FLAGS.hidden_size,
        attn_lenth=FLAGS.attn_lenth
    )
    model.build_test_graph(150)  # fixed sequence length for the test graph
    saver = tf.train.Saver()
    test_labels = np.reshape(test_labels, [test_num, 1])
    if restore_from_checkpoint(sess, saver, 'save/pt-bi-lstm-attn'):
        # test_loss, accuracy, expection, w2v, alpha = sess.run(
        #     [model.test_loss, model.test_accuracy, model.expection, model.embeddings, model.alpha],
        #     feed_dict=test_feed_dict)
        total_test_loss = 0
        total_accuracy = 0
        total_expection = []
        # threshold = 0.9
        for piece_inputs, piece_lenth, piece_labels in get_test_batches(
                test_inputs, test_lenth, test_labels, test_num):
            piece_num = len(piece_inputs)
            test_feed_dict = {
                model.test_inputs: piece_inputs,
                model.test_lenth: piece_lenth,
                model.test_labels: piece_labels
            }
            test_loss, accuracy, expection, w2v = sess.run(
                [model.test_loss, model.test_accuracy, model.expection,
                 model.embeddings],
                feed_dict=test_feed_dict)
            # Per-batch metrics are weighted by batch size so the final
            # division by test_num yields a true per-example average.
            total_test_loss += test_loss * piece_num
            total_accuracy += accuracy * piece_num
            total_expection.extend(expection)
            # for i in range(len(expection)):
            #     if expection[i] < threshold:
            #         logit = 0
            #     else:
            #         logit = relu
            #     total_expection.append(logit)
            #     if logit == piece_labels[i]:
            #         total_accuracy += relu
        total_test_loss /= test_num
        total_accuracy /= test_num
        for i in range(test_num):
            print(i, [glossary[word] for word in test_inputs[i]])
            print(test_inputs[i])
            # print(alpha[i])
            print(test_labels[i], total_expection[i])

        def f_value():
            """Print confusion-matrix-derived metrics for both classes."""
            # True positives (label 0 predicted 0 — negatives are the
            # class of interest here).
            TP = 0
            # False positives.
            FP = 0
            # False negatives.
            FN = 0
            # True negatives.
            TN = 0
            # We pay more attention on negative samples.
            for i in range(test_num):
                if test_labels[i] == 0 and total_expection[i] == 0:
                    TP += 1
                elif test_labels[i] == 0 and total_expection[i] == 1:
                    FN += 1
                elif test_labels[i] == 1 and total_expection[i] == 0:
                    FP += 1
                elif test_labels[i] == 1 and total_expection[i] == 1:
                    TN += 1
            # +0.0001 guards against division by zero for empty classes.
            P = TP / (TP + FP + 0.0001)
            R = TP / (TP + FN + 0.0001)
            F = 2 * P * R / (P + R + 0.0001)
            P_ = TN / (TN + FN + 0.0001)
            R_ = TN / (TN + FP + 0.0001)
            F_ = 2 * P_ * R_ / (P_ + R_ + 0.0001)
            # NOTE(review): ACC is computed but never printed or returned.
            ACC = (TP + TN) / (TP + FP + TN + FN + 0.0001)
            print("Validation: Average loss: {};".format(total_test_loss))
            print("            accuracy rate: {:.4f}".format(total_accuracy))
            print("About negative samples:")
            print("            precision rate: {:.4f}".format(P))
            print("            recall rate: {:.4f}".format(R))
            print("            f-value: {:.4f}".format(F))
            print("About positive samples:")
            print("            precision rate: {:.4f}".format(P_))
            print("            recall rate: {:.4f}".format(R_))
            print("            f-value: {:.4f}".format(F_))
        f_value()
        return total_expection
    else:
        print("error!")
import pickle
import re

import jieba

import reader

# Module-level glossary, loaded once at import time.
VOCABULARY, WORD2ID = reader.read_glossary()


def splitSentence(text):
    """Split *text* into sentence fragments accumulated in list_temp.

    NOTE(review): only the helper definitions are visible here — the body
    that consumes them (and any return) appears to be truncated in this
    view, and str_temp / last_special_flag / quotes_flag are never used in
    what is shown.  Confirm against the full file.
    """
    list_temp = []        # completed fragments, joined as strings
    str_temp = []         # current fragment's characters (unused in view)
    last_special_flag = None
    quotes_flag = False

    def _add(_word, _seq):
        # Append _word to the working sequence and hand it back.
        _seq.append(_word)
        return _seq

    def _new(_word, _seq, add_after_push=False):
        # Flush the working sequence into list_temp and start a new one.
        # When add_after_push is True, _word joins the flushed fragment;
        # otherwise it seeds the next fragment.
        if _seq == []:
            _seq.append(_word)
        else:
            if add_after_push is True:
                _seq.append(_word)
                list_temp.append(''.join(_seq))
                _seq = []
            else:
                list_temp.append(''.join(_seq))
                _seq = [_word]
        return _seq
def train(sess):
    """Train the encoder model with periodic validation and checkpoints.

    Reads corpus split '0' and pretrained word vectors, builds the train and
    validation graphs, then runs batched training: every step it also
    evaluates the full validation set, and every FLAGS.save_every_n steps it
    logs averaged metrics to stdout/TensorBoard and saves a checkpoint.
    """
    # ---- Pretreatment ----
    print("Read file --")
    start = time.time()
    id2word, word2id = reader.read_glossary()
    train_corpus, valid_corpus, _ = reader.read_corpus(index='0',
                                                       pick_test=False)
    pretrained_wv = reader.read_initw2v()

    train_inputs = []
    train_lenth = []
    train_labels = []
    train_num = 0
    for item in train_corpus:
        # item = [label, token_ids, length] in this corpus layout.
        train_inputs.append(item[1])
        train_lenth.append(int(item[2]))
        # '==' replaces the original "is 'T'": identity comparison against a
        # string literal only works by CPython interning accident and is a
        # SyntaxWarning on modern interpreters.
        train_labels.append(1 if item[0] == 'T' else 0)
        train_num += 1

    valid_inputs = []
    valid_lenth = []
    valid_labels = []
    valid_num = 0
    for item in valid_corpus:
        valid_inputs.append(item[1])
        valid_lenth.append(int(item[2]))
        valid_labels.append(1 if item[0] == 'T' else 0)
        valid_num += 1
    end = time.time()
    print("Read finished -- {:.4f} sec".format(end - start))

    # ---- Build model ----
    print("Building model --")
    start = end
    model = EncoderModel(batch_size=FLAGS.batch_size,
                         glossary_size=FLAGS.glossary_size,
                         embedding_size=FLAGS.embedding_size,
                         hidden_size=FLAGS.hidden_size,
                         attn_lenth=FLAGS.attn_lenth)
    model.build_train_graph()
    model.build_validate_graph(valid_num)

    init = tf.global_variables_initializer()
    # Initialize embeddings from the pretrained vectors.
    sess.run(init, feed_dict={model.pretrained_wv: pretrained_wv})
    # sess.run(init)
    saver = tf.train.Saver(max_to_keep=10)
    train_writer = tf.summary.FileWriter(logdir=FLAGS.tensorboard_dir,
                                         graph=sess.graph)
    end = time.time()
    print("Building model finished -- {:.4f} sec".format(end - start))

    # if not restore_from_checkpoint(sess, saver, FLAGS.ckpt_dir):
    #     return

    step_global = 0
    sum_loss = 0
    sum_dev_loss = 0
    sum_acc_t = 0
    sum_acc_d = 0
    # max_acc = 0
    lr = 0.001
    valid_labels = np.reshape(valid_labels, [valid_num, 1])
    dev_feed_dict = {
        model.dev_inputs: valid_inputs,
        model.dev_lenth: valid_lenth,
        model.dev_labels: valid_labels
    }

    print("Training initialized")
    start = time.time()
    for inputs, lenth, labels in get_batches(train_inputs, train_lenth,
                                             train_labels, train_num):
        step_global += 1
        labels = np.reshape(labels, [FLAGS.batch_size, 1])
        feed_dict = {
            model.inputs: inputs,
            model.lenth: lenth,
            model.labels: labels,
            model.learning_rate: lr
        }
        loss, _, t_scalar, t_acc = sess.run(
            [model.loss, model.optimizer, model.train_scalar,
             model.train_accuracy],
            feed_dict=feed_dict)
        # NOTE: a full validation pass runs every training step — expensive.
        dev_loss, d_scalar, d_acc, w2v = sess.run(
            [model.dev_loss, model.dev_scalar, model.dev_accuracy,
             model.embeddings],
            feed_dict=dev_feed_dict)
        sum_loss += loss
        sum_dev_loss += dev_loss
        sum_acc_t += t_acc
        sum_acc_d += d_acc

        def eval_ws(ws_list):
            """Word-similarity eval: Pearson/Spearman correlation between
            human scores and cosine similarity of current embeddings."""
            from scipy import stats
            from numpy import linalg as LA
            logits = []
            real = []
            eval = []
            for iter_ws in ws_list:
                if iter_ws[0] not in id2word or iter_ws[1] not in id2word:
                    continue
                else:
                    A = word2id[iter_ws[0]]
                    B = word2id[iter_ws[1]]
                    real.append(iter_ws[2])
                    logits.extend([w2v[A], w2v[B]])
            for i in range(len(logits) // 2):
                A_vec = logits[2 * i]
                B_vec = logits[2 * i + 1]
                normed_A_vec = LA.norm(A_vec, axis=0)
                normed_B_vec = LA.norm(B_vec, axis=0)
                sim = sum(np.multiply(A_vec, B_vec))
                eval.append(sim / normed_A_vec / normed_B_vec)
            pearsonr = stats.pearsonr(real, eval)[0]
            spearmanr = stats.spearmanr(real, eval).correlation
            return pearsonr, spearmanr

        if step_global % FLAGS.save_every_n == 0:
            end = time.time()
            print(
                "Training: Average loss at step {}: {};".format(
                    step_global, sum_loss / FLAGS.save_every_n),
                "time: {:.4f} sec;".format(end - start),
                "accuracy rate: {:.4f}".format(sum_acc_t / FLAGS.save_every_n))
            print(
                "Validation: Average loss: {};".format(
                    sum_dev_loss / FLAGS.save_every_n),
                "accuracy rate: {:.4f}".format(sum_acc_d / FLAGS.save_every_n))
            saver.save(sess,
                       FLAGS.ckpt_dir + "/step{}.ckpt".format(step_global))
            train_writer.add_summary(t_scalar, step_global)
            train_writer.add_summary(d_scalar, step_global)
            ac_scalar = tf.Summary(value=[
                tf.Summary.Value(tag="accuracy rate",
                                 simple_value=sum_acc_d / FLAGS.save_every_n)
            ])
            train_writer.add_summary(ac_scalar, step_global)
            # p_240, s_240 = eval_ws(reader.read_wordsim240())
            # p_297, s_297 = eval_ws(reader.read_wordsim297())
            # p_240_scalar = tf.Summary(value=[tf.Summary.Value(tag="ws240 pearsonr rate", simple_value=p_240)])
            # s_240_scalar = tf.Summary(value=[tf.Summary.Value(tag="ws240 spearmanr rate", simple_value=s_240)])
            # p_297_scalar = tf.Summary(value=[tf.Summary.Value(tag="ws297 pearsonr rate", simple_value=p_297)])
            # s_297_scalar = tf.Summary(value=[tf.Summary.Value(tag="ws297 spearmanr rate", simple_value=s_297)])
            # print("eval_ws240:")
            # print('pearsonr:%s' % p_240)
            # print('spearmanr:%s' % s_240)
            # print("eval_ws297:")
            # print('pearsonr:%s' % p_297)
            # print('spearmanr:%s' % s_297)
            # train_writer.add_summary(p_240_scalar, step_global)
            # train_writer.add_summary(s_240_scalar, step_global)
            # train_writer.add_summary(p_297_scalar, step_global)
            # train_writer.add_summary(s_297_scalar, step_global)
            # Reset running sums for the next reporting window.
            sum_loss = 0
            sum_dev_loss = 0
            sum_acc_t = 0
            sum_acc_d = 0
            start = time.time()
def test(sess):
    """Evaluate the EncoderModel on the 'klb_150' held-out test split.

    Restores a checkpoint from 'save/pt_bi_lstm_attn/1', runs the test set
    in batches, prints every example with its prediction, reports loss,
    accuracy, and per-class precision/recall/F (negatives first), then runs
    word-similarity evaluation on wordsim240/297.  Returns the prediction
    list on success; prints "error!" and returns None if restore fails.
    """
    # _, _, test_corpus = reader.read_corpus(index='1_0.2', pick_train=False, pick_valid=False, pick_test=True)
    # test_corpus, _, _ = reader.read_corpus(index=0, pick_train=True, pick_valid=False, pick_test=False)
    _, _, test_corpus = reader.read_corpus(index='klb_150', pick_train=False,
                                           pick_valid=False, pick_test=True)
    glossary, word2id = reader.read_glossary()
    test_inputs = []
    test_lenth = []
    test_labels = []
    test_num = 0
    for item in test_corpus:
        # test_inputs.append(item[1])
        # test_lenth.append(item[2])
        # if item[0] in [1, 'T', 1.0]:
        #     test_labels.append(1)
        # elif item[0] in [0, 'F', 0.0]:
        #     test_labels.append(0)
        # test_num += 1
        # item[0] is [token_ids, real_length]; item[1] is the label.
        test_inputs.append(item[0][0])
        test_lenth.append(int(item[0][1]))
        if item[1] in [1, 'T', 1.0]:
            test_labels.append(1)
        elif item[1] in [0, 'F', 0.0]:
            test_labels.append(0)
        # NOTE(review): test_num grows even when the label matches neither
        # branch — inputs and labels could desynchronize; confirm labels
        # are always in {T, F, 0, 1}.
        test_num += 1
    model = EncoderModel(batch_size=FLAGS.batch_size,
                         glossary_size=FLAGS.glossary_size,
                         embedding_size=FLAGS.embedding_size,
                         hidden_size=FLAGS.hidden_size,
                         attn_lenth=FLAGS.attn_lenth)
    model.build_test_graph(150)  # fixed sequence length for the test graph
    saver = tf.train.Saver()
    test_labels = np.reshape(test_labels, [test_num, 1])
    if restore_from_checkpoint(sess, saver, 'save/pt_bi_lstm_attn/1'):
        # test_loss, accuracy, expection, w2v, alpha = sess.run(
        #     [model.test_loss, model.test_accuracy, model.expection, model.embeddings, model.alpha],
        #     feed_dict=test_feed_dict)
        total_test_loss = 0
        total_accuracy = 0
        total_expection = []
        for piece_inputs, piece_lenth, piece_labels in get_test_batches(
                test_inputs, test_lenth, test_labels, test_num):
            piece_num = len(piece_inputs)
            test_feed_dict = {
                model.test_inputs: piece_inputs,
                model.test_lenth: piece_lenth,
                model.test_labels: piece_labels
            }
            test_loss, accuracy, expection, w2v = sess.run(
                [
                    model.test_loss, model.test_accuracy, model.expection,
                    model.embeddings
                ],
                feed_dict=test_feed_dict)
            # Weight per-batch metrics by batch size so dividing by test_num
            # gives a per-example average.
            total_test_loss += test_loss * piece_num
            total_accuracy += accuracy * piece_num
            total_expection.extend(expection)
        total_test_loss /= test_num
        total_accuracy /= test_num
        for i in range(test_num):
            print(i, [glossary[word] for word in test_inputs[i]])
            print(test_inputs[i])
            # print(alpha[i])
            print(test_labels[i], total_expection[i])

        def f_value():
            """Print confusion-matrix-derived metrics for both classes."""
            # True positives (label 0 predicted 0 — negatives are the
            # class of interest here).
            TP = 0
            # False positives.
            FP = 0
            # False negatives.
            FN = 0
            # True negatives.
            TN = 0
            # We pay more attention on negative samples.
            for i in range(test_num):
                if test_labels[i] == 0 and total_expection[i] == 0:
                    TP += 1
                elif test_labels[i] == 0 and total_expection[i] == 1:
                    FN += 1
                elif test_labels[i] == 1 and total_expection[i] == 0:
                    FP += 1
                elif test_labels[i] == 1 and total_expection[i] == 1:
                    TN += 1
            # +0.0001 guards against division by zero for empty classes.
            P = TP / (TP + FP + 0.0001)
            R = TP / (TP + FN + 0.0001)
            F = 2 * P * R / (P + R + 0.0001)
            P_ = TN / (TN + FN + 0.0001)
            R_ = TN / (TN + FP + 0.0001)
            F_ = 2 * P_ * R_ / (P_ + R_ + 0.0001)
            print("Validation: Average loss: {};".format(total_test_loss))
            print("            accuracy rate: {:.4f}".format(total_accuracy))
            print("About negative samples:")
            print("            precision rate: {:.4f}".format(P))
            print("            recall rate: {:.4f}".format(R))
            print("            f-value: {:.4f}".format(F))
            print("About positive samples:")
            print("            precision rate: {:.4f}".format(P_))
            print("            recall rate: {:.4f}".format(R_))
            print("            f-value: {:.4f}".format(F_))

        def eval_ws(ws_list):
            """Word-similarity eval: print Pearson/Spearman correlation
            between human scores and embedding cosine similarity."""
            from scipy import stats
            from numpy import linalg as LA
            logits = []
            real = []
            eval = []
            for iter_ws in ws_list:
                # NOTE(review): membership test on `glossary` (a list) is
                # O(n) per lookup — fine for small eval sets.
                if iter_ws[0] not in glossary or iter_ws[1] not in glossary:
                    continue
                else:
                    A = word2id[iter_ws[0]]
                    B = word2id[iter_ws[1]]
                    real.append(iter_ws[2])
                    logits.extend([w2v[A], w2v[B]])
            for i in range(len(logits) // 2):
                A_vec = logits[2 * i]
                B_vec = logits[2 * i + 1]
                normed_A_vec = LA.norm(A_vec, axis=0)
                normed_B_vec = LA.norm(B_vec, axis=0)
                sim = sum(np.multiply(A_vec, B_vec))
                eval.append(sim / normed_A_vec / normed_B_vec)
                # print(sim/normed_A_vec/normed_B_vec)
            print('pearsonr:%s' % (stats.pearsonr(real, eval)[0]))
            print('spearmanr:%s' % (stats.spearmanr(real, eval).correlation))
        f_value()
        eval_ws(reader.read_wordsim240())
        eval_ws(reader.read_wordsim297())
        return total_expection
    else:
        print("error!")