def predict(input_path, output_path):
    # Read the raw spreadsheet so the original text can be echoed into the output.
    df_i = pd.read_excel(input_path)
    corpus_i = df_i.iloc[:, [1]]
    corpus_i = np.array(corpus_i).tolist()

    # Preprocess into (token_ids, lenth, source_row_index) triples.
    corpus = reader.preprocess(reader.read_excel(input_path, text_column=1),
                               seq_lenth=FLAGS.seq_lenth,
                               seq_num=1,
                               overlap_lenth=0,
                               input_label=False,
                               output_index=True)
    # vocab, word2id = reader.read_glossary()

    test_inputs = []
    test_lenths = []
    test_num = 0
    for item in corpus:
        test_inputs.append(item[0])
        test_lenths.append(item[1])
        test_num += 1

    with tf.Graph().as_default(), tf.Session() as sess:
        model = em_sent(seq_size=FLAGS.seq_lenth,
                        glossary_size=FLAGS.glossary_size,
                        embedding_size=FLAGS.embedding_size,
                        hidden_size=FLAGS.hidden_size,
                        attn_lenth=FLAGS.attn_lenth,
                        is_training=False)
        model.buildTrainGraph()
        saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=10)

        if reader.restore_from_checkpoint(sess, saver, FLAGS.ckpt_dir):
            total_expection = []
            print(test_num)
            for piece_inputs, piece_lenths in get_test_batch(
                    test_inputs, test_lenths, None, test_num, input_label=False):
                test_feed_dict = {
                    model.inputs: piece_inputs,
                    model.lenths: piece_lenths,
                    model.lenths_weight: padded_ones_list_like(piece_lenths,
                                                               FLAGS.seq_lenth),
                }
                expection = sess.run(model.expection, feed_dict=test_feed_dict)
                total_expection.extend(expection)

            # Pair each original sentence with its predicted tag
            # (class 0 -> 'T', class 1 -> 'F').
            zipped = []
            for index in range(test_num):
                zipped.append([corpus_i[corpus[index][2]],
                               'T' if total_expection[index][0] == 0 else 'F'])

            df_o = pd.DataFrame(zipped)
            writer = pd.ExcelWriter(output_path)
            df_o.to_excel(writer, 'Sheet1')
            writer.save()
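# Hedged sketch (not in the original source): `padded_ones_list_like` is used
# above but defined elsewhere. Assuming it builds the per-sequence 0/1 weight
# mask fed to `model.lenths_weight` (1.0 for real tokens, 0.0 for padding up
# to seq_lenth), a minimal implementation could be:
#
# def padded_ones_list_like(lenths, seq_lenth):
#     return [[1.0] * l + [0.0] * (seq_lenth - l) for l in lenths]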
def test_onesent(text):
    """Classify a single sentence and print per-token attention weights."""
    corpus = reader.preprocess([[text]],
                               seq_lenth=FLAGS.seq_lenth,
                               seq_num=1,
                               overlap_lenth=0,
                               input_label=False,
                               output_index=False)
    vocab, word2id = reader.read_glossary()
    print(corpus)

    test_inputs = []
    test_lenths = []
    test_num = 0
    for item in corpus:
        test_inputs.append(item[0])
        test_lenths.append(item[1])
        test_num += 1

    with tf.Graph().as_default(), tf.Session() as sess:
        model = em_sent(seq_size=FLAGS.seq_lenth,
                        glossary_size=FLAGS.glossary_size,
                        embedding_size=FLAGS.embedding_size,
                        hidden_size=FLAGS.hidden_size,
                        attn_lenth=FLAGS.attn_lenth,
                        is_training=False)
        model.buildTrainGraph()
        saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=10)

        if reader.restore_from_checkpoint(sess, saver, FLAGS.ckpt_dir):
            test_feed_dict = {
                model.inputs: test_inputs,
                model.lenths: test_lenths,
                model.lenths_weight: padded_ones_list_like(test_lenths,
                                                           FLAGS.seq_lenth),
            }
            expection, alpha, logits = sess.run(
                [model.expection, model.alpha, model.logits],
                feed_dict=test_feed_dict)

            # Show each token with its attention weight and logit.
            print([vocab[i] for i in test_inputs[0]])
            for i in range(len(test_inputs[0])):
                print(vocab[test_inputs[0][i]], alpha[0][i], logits[0][i])

            if expection[0][0] == 1:
                print('negative')
            else:
                print('positive')
            return expection[0]
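# Hedged usage sketch: classify one raw sentence against the checkpoint in
# FLAGS.ckpt_dir (the sample text is illustrative, not from the original):
#
# expection = test_onesent('这家店的服务很好')
# print(expection)  # e.g. [0] (positive) or [1] (negative)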
def train(sess):
    # Pretreatment
    print("Read file --")
    start = time.time()
    # id2word, word2id = reader.read_glossary()
    train_corpus, _, _ = reader.read_corpus(index='relu', pick_valid=False, pick_test=False)
    pretrained_wv = reader.read_initw2v()
    end = time.time()
    print("Read finished -- {:.4f} sec".format(end - start))

    # Build model
    print("Building model --")
    start = end
    # model = em_doc(
    #     max_seq_size=120,
    #     glossary_size=FLAGS.glossary_size,
    #     embedding_size=FLAGS.embedding_size,
    #     hidden_size=FLAGS.hidden_size,
    #     attn_lenth=FLAGS.attn_lenth,
    #     learning_rate=0.01
    # )
    model = em_sent(batch_size=FLAGS.batch_size,
                    glossary_size=FLAGS.glossary_size,
                    embedding_size=FLAGS.embedding_size,
                    hidden_size=FLAGS.hidden_size,
                    attn_lenth=FLAGS.attn_lenth)
    model.buildTrainGraph()

    init = tf.global_variables_initializer()
    sess.run(init, feed_dict={model.pretrained_wv: pretrained_wv})
    # sess.run(init)  # a second unconditional init would overwrite the pretrained embeddings

    saver = tf.train.Saver(tf.trainable_variables(),
                           # [
                           #     model.embeddings,
                           #     model.lstm_fw_cell.weights,
                           #     model.lstm_bw_cell.weights,
                           #     model.attn_w,
                           #     model.attn_b,
                           #     model.attn_u,
                           #     model.inte_attn_w,
                           #     model.inte_attn_b,
                           #     model.inte_attn_u,
                           #     model.merge_inde_w,
                           #     model.merge_inde_b,
                           #     model.merge_inte_w,
                           #     model.merge_inte_b
                           # ],
                           max_to_keep=10)
    train_writer = tf.summary.FileWriter(logdir=FLAGS.tensorboard_dir, graph=sess.graph)
    end = time.time()
    print("Building model finished -- {:.4f} sec".format(end - start))

    if not restore_from_checkpoint(sess, saver, FLAGS.ckpt_dir):
        return

    step_global = 0
    sum_loss = 0
    # sum_dev_loss = 0
    sum_acc_t = 0
    # sum_acc_d = 0
    # max_acc = 0

    print("Training initialized")
    start = time.time()
    for epoch in range(FLAGS.epoches):
        for train_title_input, train_title_lenth, train_text_inputs, \
                train_text_lenths, train_label in get_piece(train_corpus):
            step_global += 1
            # NOTE: these title/text feed keys match the commented-out em_doc
            # model above, not the em_sent feeds used elsewhere in this file.
            feed_dict = {
                model.title_input: train_title_input,
                model.title_lenth: train_title_lenth,
                model.text_inputs: train_text_inputs,
                model.text_lenths: train_text_lenths,
                model.label: train_label
            }
            loss, _, t_scalar, t_acc = sess.run(
                [model.loss, model.optimizer, model.train_scalar, model.train_accuracy],
                feed_dict=feed_dict)
            # print(aaaa, bbbb, loss)
            sum_loss += loss
            sum_acc_t += t_acc

            # for dev_inputs, dev_lenth, dev_labels in get_batches(valid_inputs, valid_lenth, valid_labels, valid_num):
            #     dev_feed_dict = {
            #         model.dev_inputs: dev_inputs,
            #         model.dev_lenth: dev_lenth,
            #         model.dev_labels: dev_labels
            #     }
            #     dev_loss, d_scalar, d_acc, w2v = sess.run([model.dev_loss,
            #                                                model.dev_scalar,
            #                                                model.dev_accuracy,
            #                                                model.embeddings],
            #                                               feed_dict=dev_feed_dict)
            #     sum_dev_loss += dev_loss
            #     sum_acc_d += d_acc
            #
            # sum_dev_loss /= valid_num
            # sum_acc_d /= valid_num

            # def eval_ws(ws_list):
            #     from scipy import stats
            #     from numpy import linalg as LA
            #
            #     logits = []
            #     real = []
            #     eval = []
            #
            #     for iter_ws in ws_list:
            #         if iter_ws[0] not in id2word or iter_ws[1] not in id2word:
            #             continue
            #         else:
            #             A = word2id[iter_ws[0]]
            #             B = word2id[iter_ws[1]]
            #             real.append(iter_ws[2])
            #             logits.extend([w2v[A], w2v[B]])
            #
            #     for i in range(len(logits) // 2):
            #         A_vec = logits[2 * i]
            #         B_vec = logits[2 * i + 1]
            #         normed_A_vec = LA.norm(A_vec, axis=0)
            #         normed_B_vec = LA.norm(B_vec, axis=0)
            #         sim = sum(np.multiply(A_vec, B_vec))
            #         eval.append(sim / normed_A_vec / normed_B_vec)
            #
            #     pearsonr = stats.pearsonr(real, eval)[0]
            #     spearmanr = stats.spearmanr(real, eval).correlation
            #     return pearsonr, spearmanr

            if step_global % FLAGS.save_every_n == 0:
                end = time.time()
                print("Training: Average loss at step {}: {};".format(
                          step_global, sum_loss[0] / FLAGS.save_every_n),
                      "time: {:.4f} sec;".format(end - start),
                      "accuracy rate: {:.4f}".format(sum_acc_t[0] / FLAGS.save_every_n))
                # print("Validation: Average loss: {};".format(sum_dev_loss / FLAGS.save_every_n),
                #       "accuracy rate: {:.4f}".format(sum_acc_d / FLAGS.save_every_n))
                saver.save(sess, FLAGS.ckpt_dir + "/step{}.ckpt".format(step_global))
                train_writer.add_summary(t_scalar, step_global)
                # ac_scalar = tf.Summary(value=[tf.Summary.Value(tag="accuracy rate", simple_value=sum_acc_d / FLAGS.save_every_n)])
                # train_writer.add_summary(ac_scalar, step_global)
                sum_loss = 0
                # sum_dev_loss = 0
                sum_acc_t = 0
                # sum_acc_d = 0
                start = time.time()
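# Hedged usage sketch (not part of the original file): `train` expects a live
# session, so a minimal TF1-style entry point might be:
#
# def main(_):
#     with tf.Session() as sess:
#         train(sess)
#
# if __name__ == '__main__':
#     tf.app.run()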
def test_sent(sess):
    # _, _, test_corpus = reader.read_corpus(index='1_0.2', pick_train=False, pick_valid=False, pick_test=True)
    # test_corpus, _, _ = reader.read_corpus(index=0, pick_train=True, pick_valid=False, pick_test=False)
    _, _, test_corpus = reader.read_corpus(index='yhwc_150', pick_train=False, pick_valid=False, pick_test=True)
    glossary, word2id = reader.read_glossary()

    test_inputs = []
    test_lenth = []
    test_labels = []
    test_num = 0
    for item in test_corpus:
        test_inputs.append(item[0][0])
        test_lenth.append(int(item[0][1]))
        if item[1] in [1, 'T', 1.0]:
            test_labels.append(1)
        elif item[1] in [0, 'F', 0.0]:
            test_labels.append(0)
        test_num += 1

    model = em_sent(batch_size=FLAGS.batch_size,
                    glossary_size=FLAGS.glossary_size,
                    embedding_size=FLAGS.embedding_size,
                    hidden_size=FLAGS.hidden_size,
                    attn_lenth=FLAGS.attn_lenth)
    model.build_test_graph(150)
    saver = tf.train.Saver()
    test_labels = np.reshape(test_labels, [test_num, 1])

    if restore_from_checkpoint(sess, saver, 'save/pt-bi-lstm-attn'):
        # test_loss, accuracy, expection, w2v, alpha = sess.run(
        #     [model.test_loss, model.test_accuracy, model.expection, model.embeddings, model.alpha],
        #     feed_dict=test_feed_dict)
        total_test_loss = 0
        total_accuracy = 0
        total_expection = []
        # threshold = 0.9

        for piece_inputs, piece_lenth, piece_labels in get_test_batches(
                test_inputs, test_lenth, test_labels, test_num):
            piece_num = len(piece_inputs)
            test_feed_dict = {
                model.test_inputs: piece_inputs,
                model.test_lenth: piece_lenth,
                model.test_labels: piece_labels
            }
            test_loss, accuracy, expection, w2v = sess.run(
                [model.test_loss, model.test_accuracy, model.expection, model.embeddings],
                feed_dict=test_feed_dict)
            total_test_loss += test_loss * piece_num
            total_accuracy += accuracy * piece_num
            total_expection.extend(expection)
            # for i in range(len(expection)):
            #     if expection[i] < threshold:
            #         logit = 0
            #     else:
            #         logit = 1
            #     total_expection.append(logit)
            #     if logit == piece_labels[i]:
            #         total_accuracy += 1

        total_test_loss /= test_num
        total_accuracy /= test_num

        for i in range(test_num):
            print(i, [glossary[word] for word in test_inputs[i]])
            print(test_inputs[i])
            # print(alpha[i])
            print(test_labels[i], total_expection[i])

        def f_value():
            # Confusion-matrix counts. Class 0 is treated as the "positive"
            # class here because we pay more attention to negative samples.
            TP = 0  # true positives
            FP = 0  # false positives
            FN = 0  # false negatives
            TN = 0  # true negatives
            for i in range(test_num):
                if test_labels[i] == 0 and total_expection[i] == 0:
                    TP += 1
                elif test_labels[i] == 0 and total_expection[i] == 1:
                    FN += 1
                elif test_labels[i] == 1 and total_expection[i] == 0:
                    FP += 1
                elif test_labels[i] == 1 and total_expection[i] == 1:
                    TN += 1

            # The +0.0001 terms only guard against division by zero.
            P = TP / (TP + FP + 0.0001)
            R = TP / (TP + FN + 0.0001)
            F = 2 * P * R / (P + R + 0.0001)
            P_ = TN / (TN + FN + 0.0001)
            R_ = TN / (TN + FP + 0.0001)
            F_ = 2 * P_ * R_ / (P_ + R_ + 0.0001)
            ACC = (TP + TN) / (TP + FP + TN + FN + 0.0001)

            print("Validation: Average loss: {};".format(total_test_loss))
            print("            accuracy rate: {:.4f}".format(total_accuracy))
            print("About negative samples:")
            print("    precision rate: {:.4f}".format(P))
            print("    recall rate: {:.4f}".format(R))
            print("    f-value: {:.4f}".format(F))
            print("About positive samples:")
            print("    precision rate: {:.4f}".format(P_))
            print("    recall rate: {:.4f}".format(R_))
            print("    f-value: {:.4f}".format(F_))

        f_value()
        return total_expection
    else:
        print("error!")
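# Hedged sketch: `get_test_batches` (and the `get_test_batch` variant used in
# `predict`/`test`) is defined elsewhere. Assuming it just slices the test set
# into fixed-size pieces, it could look like this (batch_size is illustrative):
#
# def get_test_batches(inputs, lenths, labels, num, batch_size=64):
#     for lo in range(0, num, batch_size):
#         hi = min(lo + batch_size, num)
#         yield inputs[lo:hi], lenths[lo:hi], labels[lo:hi]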
def test(corpus):
    test_inputs = []
    test_lenths = []
    test_labels = []
    test_num = 0
    for item in corpus:
        test_inputs.append(item[0])
        test_lenths.append(item[1])
        if item[2] in [0, 'T', 0.0]:
            test_labels.append(0)
        elif item[2] in [1, 'F', 1.0]:
            test_labels.append(1)
        test_num += 1

    with tf.Graph().as_default(), tf.Session() as sess:
        model = em_sent(
            # batch_size=FLAGS.batch_size,
            seq_size=FLAGS.seq_lenth,
            glossary_size=FLAGS.glossary_size,
            embedding_size=FLAGS.embedding_size,
            hidden_size=FLAGS.hidden_size,
            attn_lenth=FLAGS.attn_lenth,
            is_training=False)
        model.buildTrainGraph()
        saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=10)

        if reader.restore_from_checkpoint(sess, saver, FLAGS.ckpt_dir):
            total_test_loss = 0
            total_accuracy = 0
            total_expection = []
            print(test_num)

            for piece_inputs, piece_lenths, piece_labels in \
                    get_test_batch(test_inputs, test_lenths, test_labels, test_num):
                piece_num = len(piece_inputs)
                test_feed_dict = {
                    model.inputs: piece_inputs,
                    model.lenths: piece_lenths,
                    model.lenths_weight: padded_ones_list_like(piece_lenths,
                                                               FLAGS.seq_lenth),
                    model.labels: piece_labels
                }
                test_loss, accuracy, expection, w2v = sess.run(
                    [model.loss, model.accuracy, model.expection, model.embeddings],
                    feed_dict=test_feed_dict)
                total_test_loss += test_loss * piece_num
                total_accuracy += accuracy * piece_num
                total_expection.extend(expection)

            total_test_loss /= test_num
            total_accuracy /= test_num

            # for i in range(test_num):
            #     print(i, [vocab[word] for word in test_inputs[i]])
            #     print(test_inputs[i])
            #     # print(alpha[i])
            #     print(test_labels[i], total_expection[i])

            def f_value():
                TP = 0  # true positives
                FP = 0  # false positives
                FN = 0  # false negatives
                TN = 0  # true negatives
                for i in range(test_num):
                    if test_labels[i] == 0 and total_expection[i] == 0:
                        TP += 1
                    elif test_labels[i] == 0 and total_expection[i] == 1:
                        FN += 1
                    elif test_labels[i] == 1 and total_expection[i] == 0:
                        FP += 1
                    elif test_labels[i] == 1 and total_expection[i] == 1:
                        TN += 1

                # The +0.0001 terms only guard against division by zero.
                P = TP / (TP + FP + 0.0001)
                R = TP / (TP + FN + 0.0001)
                F = 2 * P * R / (P + R + 0.0001)
                P_ = TN / (TN + FN + 0.0001)
                R_ = TN / (TN + FP + 0.0001)
                F_ = 2 * P_ * R_ / (P_ + R_ + 0.0001)
                ACC = (TP + TN) / (TP + FP + TN + FN + 0.0001)

                print("Validation: Average loss: {};".format(total_test_loss))
                print("            accuracy rate: {:.4f}".format(total_accuracy))
                print("About positive samples:")
                print("    precision rate: {:.4f}".format(P))
                print("    recall rate: {:.4f}".format(R))
                print("    f-value: {:.4f}".format(F))
                print("About negative samples:")
                print("    precision rate: {:.4f}".format(P_))
                print("    recall rate: {:.4f}".format(R_))
                print("    f-value: {:.4f}".format(F_))

            f_value()
        else:
            print("error!")
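# Worked example for the f_value formulas above (numbers are illustrative, not
# from any real run): with TP=40, FP=10, FN=5, TN=45,
#     P = 40 / 50       = 0.800
#     R = 40 / 45       ≈ 0.889
#     F = 2*P*R / (P+R) ≈ 0.842
# The +0.0001 smoothing perturbs these values only negligibly.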