Example #1
def predict(input_path, output_path):
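    # Read sentences from column 1 of the input Excel file, run the restored
    # sentiment model over them in batches, and write a 'T'/'F' prediction for
    # each row to the output Excel file.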
    df_i = pd.read_excel(input_path)
    corpus_i = df_i.iloc[:, [1]]
    corpus_i = np.array(corpus_i).tolist()

    corpus = reader.preprocess(reader.read_excel(input_path, text_column=1),
                               seq_lenth=FLAGS.seq_lenth,
                               seq_num=1,
                               overlap_lenth=0,
                               input_label=False,
                               output_index=True)
    # vocab, word2id = reader.read_glossary()

    test_inputs = []
    test_lenths = []
    test_num = 0
    for item in corpus:
        test_inputs.append(item[0])
        test_lenths.append(item[1])
        test_num += 1

    with tf.Graph().as_default(), tf.Session() as sess:
        model = em_sent(seq_size=FLAGS.seq_lenth,
                        glossary_size=FLAGS.glossary_size,
                        embedding_size=FLAGS.embedding_size,
                        hidden_size=FLAGS.hidden_size,
                        attn_lenth=FLAGS.attn_lenth,
                        is_training=False)
        model.buildTrainGraph()

        saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=10)

        if reader.restore_from_checkpoint(sess, saver, FLAGS.ckpt_dir):
            total_expection = []
            print(test_num)
            for piece_inputs, piece_lenths in get_test_batch(
                    test_inputs, test_lenths, None, test_num,
                    input_label=False):
                test_feed_dict = {
                    model.inputs: piece_inputs,
                    model.lenths: piece_lenths,
                    model.lenths_weight: padded_ones_list_like(piece_lenths, FLAGS.seq_lenth),
                }
                expection = sess.run(model.expection, feed_dict=test_feed_dict)
                total_expection.extend(expection)

            zipped = []
            for index in range(test_num):
                zipped.append([
                    corpus_i[corpus[index][2]],
                    'T' if total_expection[index][0] == 0 else 'F'
                ])
            df_o = pd.DataFrame(zipped)
            writer = pd.ExcelWriter(output_path)
            df_o.to_excel(writer, 'Sheet1')
            writer.save()
Example #2
def test_onesent(text):
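    # Score a single sentence: preprocess the text, restore the model from the
    # latest checkpoint, print the attention weight and logit for each token,
    # and return the prediction.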
    corpus = reader.preprocess([[text]],
                               seq_lenth=FLAGS.seq_lenth,
                               seq_num=1,
                               overlap_lenth=0,
                               input_label=False,
                               output_index=False)
    vocab, word2id = reader.read_glossary()

    print(corpus)
    test_inputs = []
    test_lenths = []
    test_num = 0
    for item in corpus:
        test_inputs.append(item[0])
        test_lenths.append(item[1])
        test_num += 1

    with tf.Graph().as_default(), tf.Session() as sess:
        model = em_sent(seq_size=FLAGS.seq_lenth,
                        glossary_size=FLAGS.glossary_size,
                        embedding_size=FLAGS.embedding_size,
                        hidden_size=FLAGS.hidden_size,
                        attn_lenth=FLAGS.attn_lenth,
                        is_training=False)
        model.buildTrainGraph()

        saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=10)

        if reader.restore_from_checkpoint(sess, saver, FLAGS.ckpt_dir):
            test_feed_dict = {
                model.inputs: test_inputs,
                model.lenths: test_lenths,
                model.lenths_weight: padded_ones_list_like(test_lenths, FLAGS.seq_lenth),
            }
            expection, alpha, logits = sess.run(
                [model.expection, model.alpha, model.logits],
                feed_dict=test_feed_dict)

            print([vocab[i] for i in test_inputs[0]])

            for i in range(len(test_inputs[0])):
                print(vocab[test_inputs[0][i]], alpha[0][i], logits[0][i])

            if expection[0][0] == 1:
                print('负面')  # negative
            else:
                print('正面')  # positive

            return expection[0]
Example #3
def train(sess):
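    # Train the sentence-level model: load the corpus and the pretrained word
    # vectors, build the training graph, then loop over epochs, logging and
    # saving a checkpoint every FLAGS.save_every_n steps.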
    # Pretreatment
    print("Read file --")
    start = time.time()

    # id2word, word2id = reader.read_glossary()
    train_corpus, _, _ = reader.read_corpus(index='relu', pick_valid=False, pick_test=False)
    pretrained_wv = reader.read_initw2v()

    end = time.time()
    print("Read finished -- {:.4f} sec".format(end-start))

    # Build model
    print("Building model --")
    start = end

    # model = em_doc(
    #     max_seq_size=120,
    #     glossary_size=FLAGS.glossary_size,
    #     embedding_size=FLAGS.embedding_size,
    #     hidden_size=FLAGS.hidden_size,
    #     attn_lenth=FLAGS.attn_lenth,
    #     learning_rate=0.01
    # )
    model = em_sent(
        batch_size=FLAGS.batch_size,
        glossary_size=FLAGS.glossary_size,
        embedding_size=FLAGS.embedding_size,
        hidden_size=FLAGS.hidden_size,
        attn_lenth=FLAGS.attn_lenth
    )
    model.buildTrainGraph()

    init = tf.global_variables_initializer()
    # Initialize variables once, feeding the pretrained word vectors into the graph.
    sess.run(init, feed_dict={model.pretrained_wv: pretrained_wv})

    saver = tf.train.Saver(tf.trainable_variables(),
        # [
        #     model.embeddings,
        #     model.lstm_fw_cell.weights,
        #     model.lstm_bw_cell.weights,
        #     model.attn_w,
        #     model.attn_b,
        #     model.attn_u,
        #     model.inte_attn_w,
        #     model.inte_attn_b,
        #     model.inte_attn_u,
        #     model.merge_inde_w,
        #     model.merge_inde_b,
        #     model.merge_inte_w,
        #     model.merge_inte_b
        # ],
        max_to_keep=10)
    train_writer = tf.summary.FileWriter(logdir=FLAGS.tensorboard_dir, graph=sess.graph)

    end = time.time()
    print("Building model finished -- {:.4f} sec".format(end - start))

    if not restore_from_checkpoint(sess, saver, FLAGS.ckpt_dir):
        return
    step_global = 0
    sum_loss = 0
    # sum_dev_loss = 0
    sum_acc_t = 0
    # sum_acc_d = 0
    # max_acc = 0

    print("Training initialized")
    start = time.time()

    for epoch in range(FLAGS.epoches):
        for train_title_input, train_title_lenth, train_text_inputs, train_text_lenths, train_label in get_piece(train_corpus):
            step_global += 1
            feed_dict = {
                model.title_input: train_title_input,
                model.title_lenth: train_title_lenth,
                model.text_inputs: train_text_inputs,
                model.text_lenths: train_text_lenths,
                model.label: train_label
            }

            loss, _, t_scalar, t_acc = sess.run(
                [model.loss, model.optimizer, model.train_scalar, model.train_accuracy],
                feed_dict=feed_dict)
            # print(aaaa, bbbb, loss)
            sum_loss += loss
            sum_acc_t += t_acc

            # for dev_inputs, dev_lenth, dev_labels in get_batches(valid_inputs, valid_lenth, valid_labels, valid_num):
            #     dev_feed_dict = {
            #         model.dev_inputs: dev_inputs,
            #         model.dev_lenth: dev_lenth,
            #         model.dev_labels: dev_labels
            #     }
            #     dev_loss, d_scalar, d_acc, w2v = sess.run([model.dev_loss,
            #                                                model.dev_scalar,
            #                                                model.dev_accuracy,
            #                                                model.embeddings],
            #                                               feed_dict=dev_feed_dict)
            #     sum_dev_loss += dev_loss
            #     sum_acc_d += d_acc
            #
            # sum_dev_loss /= valid_num
            # sum_acc_d /= valid_num

            # def eval_ws(ws_list):
            #     from scipy import stats
            #     from numpy import linalg as LA
            #
            #     logits = []
            #     real = []
            #     eval = []
            #
            #     for iter_ws in ws_list:
            #         if iter_ws[0] not in id2word or iter_ws[1] not in id2word:
            #             continue
            #         else:
            #             A = word2id[iter_ws[0]]
            #             B = word2id[iter_ws[1]]
            #             real.append(iter_ws[2])
            #             logits.extend([w2v[A], w2v[B]])
            #
            #     for i in range(len(logits) // 2):
            #         A_vec = logits[2 * i]
            #         B_vec = logits[2 * i + 1]
            #         normed_A_vec = LA.norm(A_vec, axis=0)
            #         normed_B_vec = LA.norm(B_vec, axis=0)
            #         sim = sum(np.multiply(A_vec, B_vec))
            #         eval.append(sim / normed_A_vec / normed_B_vec)
            #
            #     pearsonr = stats.pearsonr(real, eval)[0]
            #     spearmanr = stats.spearmanr(real, eval).correlation
            #     return pearsonr, spearmanr


            if step_global % FLAGS.save_every_n == 0:
                end = time.time()
                print("Training: Average loss at step {}: {};".format(step_global, sum_loss[0] / FLAGS.save_every_n),
                      "time: {:.4f} sec;".format(end - start),
                      "accuracy rate: {:.4f}".format(sum_acc_t[0] / FLAGS.save_every_n))
                # print("Validation: Average loss: {};".format(sum_dev_loss / FLAGS.save_every_n),
                #       "accuracy rate: {:.4f}".format(sum_acc_d / FLAGS.save_every_n))

                saver.save(sess, FLAGS.ckpt_dir + "/step{}.ckpt".format(step_global))

                train_writer.add_summary(t_scalar, step_global)
                # ac_scalar = tf.Summary(value=[tf.Summary.Value(tag="accuracy rate", simple_value=sum_acc_d / FLAGS.save_every_n)])
                # train_writer.add_summary(ac_scalar, step_global)

                sum_loss = 0
                # sum_dev_loss = 0
                sum_acc_t = 0
                # sum_acc_d = 0

                start = time.time()
Example #4
def test_sent(sess):
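    # Evaluate on a labelled test corpus: restore a checkpoint, accumulate loss
    # and accuracy over test batches, print each prediction, and report
    # precision/recall/F for both classes via f_value().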
    # _, _, test_corpus = reader.read_corpus(index='1_0.2', pick_train=False, pick_valid=False, pick_test=True)
    # test_corpus, _, _ = reader.read_corpus(index=0, pick_train=True, pick_valid=False, pick_test=False)
    _, _, test_corpus = reader.read_corpus(index='yhwc_150', pick_train=False, pick_valid=False, pick_test=True)

    glossary, word2id = reader.read_glossary()

    test_inputs = []
    test_lenth = []
    test_labels = []
    test_num = 0
    for item in test_corpus:
        test_inputs.append(item[0][0])
        test_lenth.append(int(item[0][1]))
        if item[1] in [1, 'T', 1.0]:
            test_labels.append(1)
        elif item[1] in [0, 'F', 0.0]:
            test_labels.append(0)
        test_num += 1

    model = em_sent(
        batch_size=FLAGS.batch_size,
        glossary_size=FLAGS.glossary_size,
        embedding_size=FLAGS.embedding_size,
        hidden_size=FLAGS.hidden_size,
        attn_lenth=FLAGS.attn_lenth
    )
    model.build_test_graph(150)

    saver = tf.train.Saver()
    test_labels = np.reshape(test_labels, [test_num, 1])

    if restore_from_checkpoint(sess, saver, 'save/pt-bi-lstm-attn'):
        # test_loss, accuracy, expection, w2v, alpha = sess.run(
        #     [model.test_loss, model.test_accuracy, model.expection, model.embeddings, model.alpha],
        #                               feed_dict=test_feed_dict)
        total_test_loss = 0
        total_accuracy = 0
        total_expection = []
        # threshold = 0.9
        for piece_inputs, piece_lenth, piece_labels in get_test_batches(test_inputs, test_lenth, test_labels, test_num):
            piece_num = len(piece_inputs)
            test_feed_dict = {
                model.test_inputs: piece_inputs,
                model.test_lenth: piece_lenth,
                model.test_labels: piece_labels
            }
            test_loss, accuracy, expection, w2v = sess.run(
                [model.test_loss, model.test_accuracy, model.expection, model.embeddings],
                feed_dict=test_feed_dict)
            total_test_loss += test_loss * piece_num
            total_accuracy += accuracy * piece_num
            total_expection.extend(expection)
            # for i in range(len(expection)):
            #     if expection[i] < threshold:
            #         logit = 0
            #     else:
            #         logit = 1
            #     total_expection.append(logit)
            #     if logit == piece_labels[i]:
            #         total_accuracy += 1

        total_test_loss /= test_num
        total_accuracy /= test_num
        for i in range(test_num):
            print(i, [glossary[word] for word in test_inputs[i]])
            print(test_inputs[i])
            # print(alpha[i])
            print(test_labels[i], total_expection[i])

        def f_value():
            # true positives
            TP = 0
            # false positives
            FP = 0
            # false negatives
            FN = 0
            # true negatives
            TN = 0

            # We pay more attention to negative samples.
            for i in range(test_num):
                if test_labels[i] == 0 and total_expection[i] == 0:
                    TP += 1
                elif test_labels[i] == 0 and total_expection[i] == 1:
                    FN += 1
                elif test_labels[i] == 1 and total_expection[i] == 0:
                    FP += 1
                elif test_labels[i] == 1 and total_expection[i] == 1:
                    TN += 1

            P = TP / (TP + FP + 0.0001)
            R = TP / (TP + FN + 0.0001)
            F = 2 * P * R / (P + R + 0.0001)
            P_ = TN / (TN + FN + 0.0001)
            R_ = TN / (TN + FP + 0.0001)
            F_ = 2 * P_ * R_ / (P_ + R_ + 0.0001)
            ACC = (TP + TN) / (TP + FP + TN + FN + 0.0001)
            print("Validation: Average loss: {};".format(total_test_loss))
            print("     accuracy rate: {:.4f}".format(total_accuracy))
            print("About negative samples:")
            print("     precision rate: {:.4f}".format(P))
            print("     recall rate: {:.4f}".format(R))
            print("     f-value: {:.4f}".format(F))

            print("About positive samples:")
            print("     precision rate: {:.4f}".format(P_))
            print("     recall rate: {:.4f}".format(R_))
            print("     f-value: {:.4f}".format(F_))

        f_value()
        return total_expection
    else:
        print("error!")
Example #5
def test(corpus):
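    # Evaluate on an already-preprocessed corpus: restore the checkpoint,
    # accumulate loss and accuracy over test batches, and report
    # precision/recall/F for both classes via f_value().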
    test_inputs = []
    test_lenths = []
    test_labels = []
    test_num = 0
    for item in corpus:
        test_inputs.append(item[0])
        test_lenths.append(item[1])
        if item[2] in [0, 'T', 0.0]:
            test_labels.append(0)
        elif item[2] in [1, 'F', 1.0]:
            test_labels.append(1)
        test_num += 1

    with tf.Graph().as_default(), tf.Session() as sess:
        model = em_sent(
            # batch_size=FLAGS.batch_size,
            seq_size=FLAGS.seq_lenth,
            glossary_size=FLAGS.glossary_size,
            embedding_size=FLAGS.embedding_size,
            hidden_size=FLAGS.hidden_size,
            attn_lenth=FLAGS.attn_lenth,
            is_training=False)
        model.buildTrainGraph()

        saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=10)

        if reader.restore_from_checkpoint(sess, saver, FLAGS.ckpt_dir):
            total_test_loss = 0
            total_accuracy = 0
            total_expection = []
            print(test_num)
            for piece_inputs, piece_lenths, piece_labels in \
                    get_test_batch(test_inputs, test_lenths, test_labels, test_num):
                piece_num = len(piece_inputs)
                test_feed_dict = {
                    model.inputs: piece_inputs,
                    model.lenths: piece_lenths,
                    model.lenths_weight: padded_ones_list_like(piece_lenths, FLAGS.seq_lenth),
                    model.labels: piece_labels
                }
                test_loss, accuracy, expection, w2v = sess.run(
                    [model.loss, model.accuracy, model.expection, model.embeddings],
                    feed_dict=test_feed_dict)
                total_test_loss += test_loss * piece_num
                total_accuracy += accuracy * piece_num
                total_expection.extend(expection)

            total_test_loss /= test_num
            total_accuracy /= test_num

            # for i in range(test_num):
            #     print(i, [vocab[word] for word in test_inputs[i]])
            #     print(test_inputs[i])
            #     # print(alpha[i])
            #     print(test_labels[i], total_expection[i])

            def f_value():
                # true positives
                TP = 0
                # false positives
                FP = 0
                # false negatives
                FN = 0
                # true negatives
                TN = 0

                for i in range(test_num):
                    if test_labels[i] == 0 and total_expection[i] == 0:
                        TP += 1
                    elif test_labels[i] == 0 and total_expection[i] == 1:
                        FN += 1
                    elif test_labels[i] == 1 and total_expection[i] == 0:
                        FP += 1
                    elif test_labels[i] == 1 and total_expection[i] == 1:
                        TN += 1

                P = TP / (TP + FP + 0.0001)
                R = TP / (TP + FN + 0.0001)
                F = 2 * P * R / (P + R + 0.0001)
                P_ = TN / (TN + FN + 0.0001)
                R_ = TN / (TN + FP + 0.0001)
                F_ = 2 * P_ * R_ / (P_ + R_ + 0.0001)
                ACC = (TP + TN) / (TP + FP + TN + FN + 0.0001)
                print("Validation: Average loss: {};".format(total_test_loss))
                print("     accuracy rate: {:.4f}".format(total_accuracy))
                print("About positive samples:")
                print("     precision rate: {:.4f}".format(P))
                print("     recall rate: {:.4f}".format(R))
                print("     f-value: {:.4f}".format(F))

                print("About negative samples:")
                print("     precision rate: {:.4f}".format(P_))
                print("     recall rate: {:.4f}".format(R_))
                print("     f-value: {:.4f}".format(F_))

            f_value()
        else:
            print("error!")