Example #1
def test_without_eval(sess):
    _, _, test_corpus = reader.read_corpus(index=1,
                                           pick_train=False,
                                           pick_valid=False,
                                           pick_test=True)

    test_inputs = []
    test_lenth = []
    test_num = 0
    for item in test_corpus:
        test_inputs.append(item[0][0])
        test_lenth.append(int(item[0][1]))
        test_num += 1

    model = EncoderModel(batch_size=FLAGS.batch_size,
                         glossary_size=FLAGS.glossary_size,
                         embedding_size=FLAGS.embedding_size,
                         hidden_size=FLAGS.hidden_size,
                         attn_lenth=FLAGS.attn_lenth)
    model.build_test_graph()

    saver = tf.train.Saver()

    expection = []
    if restore_from_checkpoint(sess, saver, 'save/pt_bi_lstm_attn/1'):
        for piece_inputs, piece_lenth in get_splited_batches(
                test_inputs, test_lenth, test_num):
            test_feed_dict = {
                model.test_inputs: piece_inputs,
                model.test_lenth: piece_lenth
            }
            piece_expect = np.reshape(
                sess.run([model.expection], feed_dict=test_feed_dict), [-1])
            expection.extend(piece_expect)
    return expection
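
A minimal sketch of the two helpers Example #1 relies on but does not define: restore_from_checkpoint and get_splited_batches. Their names and call signatures come from the snippet above; the bodies below are assumptions, not the original implementation.

import tensorflow as tf


def restore_from_checkpoint(sess, saver, ckpt_dir):
    """Restore the latest checkpoint in ckpt_dir; return False if none is found (assumed behaviour)."""
    ckpt = tf.train.latest_checkpoint(ckpt_dir)
    if ckpt is None:
        print("No checkpoint found in {}".format(ckpt_dir))
        return False
    saver.restore(sess, ckpt)
    return True


def get_splited_batches(inputs, lenth, num, batch_size=64):
    """Yield (inputs, lenth) slices of at most batch_size examples (assumed behaviour)."""
    for start in range(0, num, batch_size):
        end = min(start + batch_size, num)
        yield inputs[start:end], lenth[start:end]
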
Example #2
def main():
    batch_size = 8
    with tf.Graph().as_default(), tf.Session() as sess:
        model = EncoderModel(batch_size=batch_size,
                             glossary_size=30000,
                             embedding_size=300,
                             hidden_size=300,
                             attn_lenth=350)
        model.build_train_graph()

        init = tf.global_variables_initializer()
        sess.run(init)
        train_corpus, valid_corpus, _ = reader.read_corpus(index='2',
                                                           pick_test=False)
        print(np.shape(train_corpus[0][0][0]))
        # train_inputs = []
        # train_lenth = []
        # train_labels = []
        # train_num = 0
        # for item in train_corpus:
        #     train_inputs.append(item[0][0])
        #     train_lenth.append(int(item[0][1]))
        #     train_labels.append(1 if item[1] == 'T' else 0)
        #     train_num += 1
        # train_labels = np.reshape(train_labels, [-1, 1])
        feed_dict = {
            model.inputs: np.random.rand(batch_size, 1000),
            model.lenth: [1000] * batch_size,
            model.labels: np.random.rand(batch_size, 1),
            model.learning_rate: 0.01
        }
        for i in range(100):
            start = time.time()
            loss, _, t_scalar, t_acc = sess.run([
                model.loss, model.optimizer, model.train_scalar,
                model.train_accuracy
            ],
                                                feed_dict=feed_dict)
            end = time.time()
            print(loss, end - start)
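
This is only a smoke test: the inputs and labels fed above are random floats standing in for token ids, so it checks that the graph builds and a training step runs, not that the model learns anything. A plausible entry point, assumed rather than taken from the snippet:

if __name__ == '__main__':
    main()
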
Example #3
def train(sess):
    # Pretreatment
    print("Read file --")
    start = time.time()

    # id2word, word2id = reader.read_glossary()
    train_corpus, _, _ = reader.read_corpus(index='1', pick_valid=False, pick_test=False)
    pretrained_wv = reader.read_initw2v()

    end = time.time()
    print("Read finished -- {:.4f} sec".format(end-start))

    # Build model
    print("Building model --")
    start = end

    # model = em_doc(
    #     max_seq_size=120,
    #     glossary_size=FLAGS.glossary_size,
    #     embedding_size=FLAGS.embedding_size,
    #     hidden_size=FLAGS.hidden_size,
    #     attn_lenth=FLAGS.attn_lenth,
    #     learning_rate=0.01
    # )
    model = em_sent(
        batch_size=FLAGS.batch_size,
        glossary_size=FLAGS.glossary_size,
        embedding_size=FLAGS.embedding_size,
        hidden_size=FLAGS.hidden_size,
        attn_lenth=FLAGS.attn_lenth
    )
    model.buildTrainGraph()

    init = tf.global_variables_initializer()
    sess.run(init, feed_dict={model.pretrained_wv: pretrained_wv})
    # sess.run(init)

    saver = tf.train.Saver(tf.trainable_variables(),
        # [
        #     model.embeddings,
        #     model.lstm_fw_cell.weights,
        #     model.lstm_bw_cell.weights,
        #     model.attn_w,
        #     model.attn_b,
        #     model.attn_u,
        #     model.inte_attn_w,
        #     model.inte_attn_b,
        #     model.inte_attn_u,
        #     model.merge_inde_w,
        #     model.merge_inde_b,
        #     model.merge_inte_w,
        #     model.merge_inte_b
        # ],
        max_to_keep=10)
    train_writer = tf.summary.FileWriter(logdir=FLAGS.tensorboard_dir, graph=sess.graph)

    end = time.time()
    print("Building model finished -- {:.4f} sec".format(end - start))

    if not restore_from_checkpoint(sess, saver, FLAGS.ckpt_dir):
        return
    step_global = 0
    sum_loss = 0
    # sum_dev_loss = 0
    sum_acc_t = 0
    # sum_acc_d = 0
    # max_acc = 0

    print("Training initialized")
    start = time.time()

    for epoch in range(FLAGS.epoches):
        for train_title_input, train_title_lenth, train_text_inputs, train_text_lenths, train_label in get_piece(train_corpus):
            step_global += 1
            feed_dict = {
                model.title_input: train_title_input,
                model.title_lenth: train_title_lenth,
                model.text_inputs: train_text_inputs,
                model.text_lenths: train_text_lenths,
                model.label: train_label
            }

            loss, _, t_scalar, t_acc = sess.run(
                [model.loss, model.optimizer, model.train_scalar,
                 model.train_accuracy],
                feed_dict=feed_dict)
            # print(aaaa, bbbb, loss)
            sum_loss += loss
            sum_acc_t += t_acc

            # for dev_inputs, dev_lenth, dev_labels in get_batches(valid_inputs, valid_lenth, valid_labels, valid_num):
            #     dev_feed_dict = {
            #         model.dev_inputs: dev_inputs,
            #         model.dev_lenth: dev_lenth,
            #         model.dev_labels: dev_labels
            #     }
            #     dev_loss, d_scalar, d_acc, w2v = sess.run([model.dev_loss,
            #                                                model.dev_scalar,
            #                                                model.dev_accuracy,
            #                                                model.embeddings],
            #                                               feed_dict=dev_feed_dict)
            #     sum_dev_loss += dev_loss
            #     sum_acc_d += d_acc
            #
            # sum_dev_loss /= valid_num
            # sum_acc_d /= valid_num

            # def eval_ws(ws_list):
            #     from scipy import stats
            #     from numpy import linalg as LA
            #
            #     logits = []
            #     real = []
            #     eval = []
            #
            #     for iter_ws in ws_list:
            #         if iter_ws[0] not in id2word or iter_ws[1] not in id2word:
            #             continue
            #         else:
            #             A = word2id[iter_ws[0]]
            #             B = word2id[iter_ws[1]]
            #             real.append(iter_ws[2])
            #             logits.extend([w2v[A], w2v[B]])
            #
            #     for i in range(len(logits) // 2):
            #         A_vec = logits[2 * i]
            #         B_vec = logits[2 * i + 1]
            #         normed_A_vec = LA.norm(A_vec, axis=0)
            #         normed_B_vec = LA.norm(B_vec, axis=0)
            #         sim = sum(np.multiply(A_vec, B_vec))
            #         eval.append(sim / normed_A_vec / normed_B_vec)
            #
            #     pearsonr = stats.pearsonr(real, eval)[0]
            #     spearmanr = stats.spearmanr(real, eval).correlation
            #     return pearsonr, spearmanr


            if step_global % FLAGS.save_every_n == 0:
                end = time.time()
                print("Training: Average loss at step {}: {};".format(step_global, sum_loss[0] / FLAGS.save_every_n),
                      "time: {:.4f} sec;".format(end - start),
                      "accuracy rate: {:.4f}".format(sum_acc_t[0] / FLAGS.save_every_n))
                # print("Validation: Average loss: {};".format(sum_dev_loss / FLAGS.save_every_n),
                #       "accuracy rate: {:.4f}".format(sum_acc_d / FLAGS.save_every_n))

                saver.save(sess, FLAGS.ckpt_dir + "/step{}.ckpt".format(step_global))

                train_writer.add_summary(t_scalar, step_global)
                # ac_scalar = tf.Summary(value=[tf.Summary.Value(tag="accuracy rate", simple_value=sum_acc_d / FLAGS.save_every_n)])
                # train_writer.add_summary(ac_scalar, step_global)

                sum_loss = 0
                # sum_dev_loss = 0
                sum_acc_t = 0
                # sum_acc_d = 0

                start = time.time()
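
These training and test routines all read their hyperparameters from a module-level FLAGS object whose definitions are not part of the snippets. A minimal TF1-style sketch covering the flag names used above follows; the 30000/300/300/350 sizes are taken from Example #2, the remaining defaults are placeholder assumptions:

import tensorflow as tf

flags = tf.app.flags
flags.DEFINE_integer('batch_size', 64, 'Mini-batch size.')
flags.DEFINE_integer('glossary_size', 30000, 'Vocabulary size.')
flags.DEFINE_integer('embedding_size', 300, 'Word embedding dimension.')
flags.DEFINE_integer('hidden_size', 300, 'LSTM hidden-state size.')
flags.DEFINE_integer('attn_lenth', 350, 'Attention layer size.')
flags.DEFINE_integer('epoches', 10, 'Number of training epochs.')
flags.DEFINE_integer('save_every_n', 100, 'Logging/checkpoint interval in steps.')
flags.DEFINE_string('ckpt_dir', 'save/pt_bi_lstm_attn', 'Checkpoint directory.')
flags.DEFINE_string('tensorboard_dir', 'tensorboard', 'TensorBoard summary directory.')
FLAGS = flags.FLAGS
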
Example #4
def test_sent(sess):
    # _, _, test_corpus = reader.read_corpus(index='1_0.2', pick_train=False, pick_valid=False, pick_test=True)
    # test_corpus, _, _ = reader.read_corpus(index=0, pick_train=True, pick_valid=False, pick_test=False)
    _, _, test_corpus = reader.read_corpus(index='yhwc_150', pick_train=False, pick_valid=False, pick_test=True)

    glossary, word2id = reader.read_glossary()

    test_inputs = []
    test_lenth = []
    test_labels = []
    test_num = 0
    for item in test_corpus:
        test_inputs.append(item[0][0])
        test_lenth.append(int(item[0][1]))
        if item[1] in [1, 'T', 1.0]:
            test_labels.append(1)
        elif item[1] in [0, 'F', 0.0]:
            test_labels.append(0)
        test_num += 1

    model = em_sent(
        batch_size=FLAGS.batch_size,
        glossary_size=FLAGS.glossary_size,
        embedding_size=FLAGS.embedding_size,
        hidden_size=FLAGS.hidden_size,
        attn_lenth=FLAGS.attn_lenth
    )
    model.build_test_graph(150)

    saver = tf.train.Saver()
    test_labels = np.reshape(test_labels, [test_num, 1])

    if restore_from_checkpoint(sess, saver, 'save/pt-bi-lstm-attn'):
        # test_loss, accuracy, expection, w2v, alpha = sess.run(
        #     [model.test_loss, model.test_accuracy, model.expection, model.embeddings, model.alpha],
        #                               feed_dict=test_feed_dict)
        total_test_loss = 0
        total_accuracy = 0
        total_expection = []
        # threshold = 0.9
        for piece_inputs, piece_lenth, piece_labels in get_test_batches(test_inputs, test_lenth, test_labels, test_num):
            piece_num = len(piece_inputs)
            test_feed_dict = {
                model.test_inputs: piece_inputs,
                model.test_lenth: piece_lenth,
                model.test_labels: piece_labels
            }
            test_loss, accuracy, expection, w2v = sess.run(
                [model.test_loss, model.test_accuracy, model.expection, model.embeddings],
                feed_dict=test_feed_dict)
            total_test_loss += test_loss * piece_num
            total_accuracy += accuracy * piece_num
            total_expection.extend(expection)
            # for i in range(len(expection)):
            #     if expection[i] < threshold:
            #         logit = 0
            #     else:
            #         logit = 1
            #     total_expection.append(logit)
            #     if logit == piece_labels[i]:
            #         total_accuracy += 1

        total_test_loss /= test_num
        total_accuracy /= test_num
        for i in range(test_num):
            print(i, [glossary[word] for word in test_inputs[i]])
            print(test_inputs[i])
            # print(alpha[i])
            print(test_labels[i], total_expection[i])

        def f_value():
            # True positives
            TP = 0
            # False positives
            FP = 0
            # False negatives
            FN = 0
            # True negatives
            TN = 0

            # We pay more attention to negative samples.
            for i in range(test_num):
                if test_labels[i] == 0 and total_expection[i] == 0:
                    TP += 1
                elif test_labels[i] == 0 and total_expection[i] == 1:
                    FN += 1
                elif test_labels[i] == 1 and total_expection[i] == 0:
                    FP += 1
                elif test_labels[i] == 1 and total_expection[i] == 1:
                    TN += 1

            P = TP / (TP + FP + 0.0001)
            R = TP / (TP + FN + 0.0001)
            F = 2 * P * R / (P + R + 0.0001)
            P_ = TN / (TN + FN + 0.0001)
            R_ = TN / (TN + FP + 0.0001)
            F_ = 2 * P_ * R_ / (P_ + R_ + 0.0001)
            ACC = (TP + TN) / (TP + FP + TN + FN + 0.0001)
            print("Validation: Average loss: {};".format(total_test_loss))
            print("     accuracy rate: {:.4f}".format(total_accuracy))
            print("About negative samples:")
            print("     precision rate: {:.4f}".format(P))
            print("     recall rate: {:.4f}".format(R))
            print("     f-value: {:.4f}".format(F))

            print("About positive samples:")
            print("     precision rate: {:.4f}".format(P_))
            print("     recall rate: {:.4f}".format(R_))
            print("     f-value: {:.4f}".format(F_))

        f_value()
        return total_expection
    else:
        print("error!")
Example #5
def train(sess):

    # Pretreatment
    print("Read file --")
    start = time.time()

    id2word, word2id = reader.read_glossary()
    train_corpus, valid_corpus, _ = reader.read_corpus(index='0',
                                                       pick_test=False)
    pretrained_wv = reader.read_initw2v()

    train_inputs = []
    train_lenth = []
    train_labels = []
    train_num = 0
    for item in train_corpus:
        train_inputs.append(item[1])
        train_lenth.append(int(item[2]))
        train_labels.append(1 if item[0] == 'T' else 0)
        train_num += 1

    valid_inputs = []
    valid_lenth = []
    valid_labels = []
    valid_num = 0
    for item in valid_corpus:
        valid_inputs.append(item[1])
        valid_lenth.append(int(item[2]))
        valid_labels.append(1 if item[0] == 'T' else 0)
        valid_num += 1
    end = time.time()
    print("Read finished -- {:.4f} sec".format(end - start))

    # Build model
    print("Building model --")
    start = end

    model = EncoderModel(batch_size=FLAGS.batch_size,
                         glossary_size=FLAGS.glossary_size,
                         embedding_size=FLAGS.embedding_size,
                         hidden_size=FLAGS.hidden_size,
                         attn_lenth=FLAGS.attn_lenth)
    model.build_train_graph()
    model.build_validate_graph(valid_num)

    init = tf.global_variables_initializer()
    sess.run(init, feed_dict={model.pretrained_wv: pretrained_wv})
    # sess.run(init)
    saver = tf.train.Saver(max_to_keep=10)
    train_writer = tf.summary.FileWriter(logdir=FLAGS.tensorboard_dir,
                                         graph=sess.graph)

    end = time.time()
    print("Building model finished -- {:.4f} sec".format(end - start))

    # if not restore_from_checkpoint(sess, saver, FLAGS.ckpt_dir):
    #     return
    step_global = 0
    sum_loss = 0
    sum_dev_loss = 0
    sum_acc_t = 0
    sum_acc_d = 0
    # max_acc = 0
    lr = 0.001
    valid_labels = np.reshape(valid_labels, [valid_num, 1])
    dev_feed_dict = {
        model.dev_inputs: valid_inputs,
        model.dev_lenth: valid_lenth,
        model.dev_labels: valid_labels
    }

    print("Training initialized")
    start = time.time()

    for inputs, lenth, labels in get_batches(train_inputs, train_lenth,
                                             train_labels, train_num):
        step_global += 1
        labels = np.reshape(labels, [FLAGS.batch_size, 1])
        feed_dict = {
            model.inputs: inputs,
            model.lenth: lenth,
            model.labels: labels,
            model.learning_rate: lr
        }
        loss, _, t_scalar, t_acc = sess.run([
            model.loss, model.optimizer, model.train_scalar,
            model.train_accuracy
        ],
                                            feed_dict=feed_dict)
        dev_loss, d_scalar, d_acc, w2v = sess.run([
            model.dev_loss, model.dev_scalar, model.dev_accuracy,
            model.embeddings
        ],
                                                  feed_dict=dev_feed_dict)

        sum_loss += loss
        sum_dev_loss += dev_loss
        sum_acc_t += t_acc
        sum_acc_d += d_acc

        def eval_ws(ws_list):
            from scipy import stats
            from numpy import linalg as LA

            logits = []
            real = []
            eval = []

            for iter_ws in ws_list:
                if iter_ws[0] not in id2word or iter_ws[1] not in id2word:
                    continue
                else:
                    A = word2id[iter_ws[0]]
                    B = word2id[iter_ws[1]]
                    real.append(iter_ws[2])
                    logits.extend([w2v[A], w2v[B]])

            for i in range(len(logits) // 2):
                A_vec = logits[2 * i]
                B_vec = logits[2 * i + 1]
                normed_A_vec = LA.norm(A_vec, axis=0)
                normed_B_vec = LA.norm(B_vec, axis=0)
                sim = sum(np.multiply(A_vec, B_vec))
                eval.append(sim / normed_A_vec / normed_B_vec)

            pearsonr = stats.pearsonr(real, eval)[0]
            spearmanr = stats.spearmanr(real, eval).correlation
            return pearsonr, spearmanr

        if step_global % FLAGS.save_every_n == 0:
            end = time.time()
            print(
                "Training: Average loss at step {}: {};".format(
                    step_global, sum_loss / FLAGS.save_every_n),
                "time: {:.4f} sec;".format(end - start),
                "accuracy rate: {:.4f}".format(sum_acc_t / FLAGS.save_every_n))
            print(
                "Validation: Average loss: {};".format(sum_dev_loss /
                                                       FLAGS.save_every_n),
                "accuracy rate: {:.4f}".format(sum_acc_d / FLAGS.save_every_n))

            saver.save(sess,
                       FLAGS.ckpt_dir + "/step{}.ckpt".format(step_global))

            train_writer.add_summary(t_scalar, step_global)
            train_writer.add_summary(d_scalar, step_global)
            ac_scalar = tf.Summary(value=[
                tf.Summary.Value(tag="accuracy rate",
                                 simple_value=sum_acc_d / FLAGS.save_every_n)
            ])
            train_writer.add_summary(ac_scalar, step_global)
            # p_240, s_240 = eval_ws(reader.read_wordsim240())
            # p_297, s_297 = eval_ws(reader.read_wordsim297())
            # p_240_scalar = tf.Summary(value=[tf.Summary.Value(tag="ws240 pearsonr rate", simple_value=p_240)])
            # s_240_scalar = tf.Summary(value=[tf.Summary.Value(tag="ws240 spearmanr rate", simple_value=s_240)])
            # p_297_scalar = tf.Summary(value=[tf.Summary.Value(tag="ws297 pearsonr rate", simple_value=p_297)])
            # s_297_scalar = tf.Summary(value=[tf.Summary.Value(tag="ws297 spearmanr rate", simple_value=s_297)])
            # print("eval_ws240:")
            # print('pearsonr:%s' % p_240)
            # print('spearmanr:%s' % s_240)
            # print("eval_ws297:")
            # print('pearsonr:%s' % p_297)
            # print('spearmanr:%s' % s_297)
            # train_writer.add_summary(p_240_scalar, step_global)
            # train_writer.add_summary(s_240_scalar, step_global)
            # train_writer.add_summary(p_297_scalar, step_global)
            # train_writer.add_summary(s_297_scalar, step_global)

            sum_loss = 0
            sum_dev_loss = 0
            sum_acc_t = 0
            sum_acc_d = 0

            start = time.time()
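
get_batches is the remaining undefined helper in this example. The loop reshapes each label batch to [FLAGS.batch_size, 1], so the helper must yield fixed-size batches; a minimal sketch under that assumption, shuffling once and dropping the remainder:

import numpy as np


def get_batches(inputs, lenth, labels, num, batch_size=64):
    """Shuffle once, then yield fixed-size (inputs, lenth, labels) batches (assumed behaviour)."""
    order = np.random.permutation(num)
    for start in range(0, num - batch_size + 1, batch_size):
        idx = order[start:start + batch_size]
        yield ([inputs[i] for i in idx],
               [lenth[i] for i in idx],
               [labels[i] for i in idx])
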
Example #6
def test(sess):
    # _, _, test_corpus = reader.read_corpus(index='1_0.2', pick_train=False, pick_valid=False, pick_test=True)
    # test_corpus, _, _ = reader.read_corpus(index=0, pick_train=True, pick_valid=False, pick_test=False)
    _, _, test_corpus = reader.read_corpus(index='klb_150',
                                           pick_train=False,
                                           pick_valid=False,
                                           pick_test=True)

    glossary, word2id = reader.read_glossary()

    test_inputs = []
    test_lenth = []
    test_labels = []
    test_num = 0
    for item in test_corpus:
        # test_inputs.append(item[1])
        # test_lenth.append(item[2])
        # if item[0] in [1, 'T', 1.0]:
        #     test_labels.append(1)
        # elif item[0] in [0, 'F', 0.0]:
        #     test_labels.append(0)
        # test_num += 1
        test_inputs.append(item[0][0])
        test_lenth.append(int(item[0][1]))
        if item[1] in [1, 'T', 1.0]:
            test_labels.append(1)
        elif item[1] in [0, 'F', 0.0]:
            test_labels.append(0)
        test_num += 1

    model = EncoderModel(batch_size=FLAGS.batch_size,
                         glossary_size=FLAGS.glossary_size,
                         embedding_size=FLAGS.embedding_size,
                         hidden_size=FLAGS.hidden_size,
                         attn_lenth=FLAGS.attn_lenth)
    model.build_test_graph(150)

    saver = tf.train.Saver()
    test_labels = np.reshape(test_labels, [test_num, 1])

    if restore_from_checkpoint(sess, saver, 'save/pt_bi_lstm_attn/1'):
        # test_loss, accuracy, expection, w2v, alpha = sess.run(
        #     [model.test_loss, model.test_accuracy, model.expection, model.embeddings, model.alpha],
        #                               feed_dict=test_feed_dict)
        total_test_loss = 0
        total_accuracy = 0
        total_expection = []
        for piece_inputs, piece_lenth, piece_labels in get_test_batches(
                test_inputs, test_lenth, test_labels, test_num):
            piece_num = len(piece_inputs)
            test_feed_dict = {
                model.test_inputs: piece_inputs,
                model.test_lenth: piece_lenth,
                model.test_labels: piece_labels
            }
            test_loss, accuracy, expection, w2v = sess.run(
                [
                    model.test_loss, model.test_accuracy, model.expection,
                    model.embeddings
                ],
                feed_dict=test_feed_dict)
            total_test_loss += test_loss * piece_num
            total_accuracy += accuracy * piece_num
            total_expection.extend(expection)
        total_test_loss /= test_num
        total_accuracy /= test_num

        for i in range(test_num):
            print(i, [glossary[word] for word in test_inputs[i]])
            print(test_inputs[i])
            # print(alpha[i])
            print(test_labels[i], total_expection[i])

        def f_value():
            # True positives
            TP = 0
            # False positives
            FP = 0
            # False negatives
            FN = 0
            # True negatives
            TN = 0

            # We pay more attention to negative samples.
            for i in range(test_num):
                if test_labels[i] == 0 and total_expection[i] == 0:
                    TP += 1
                elif test_labels[i] == 0 and total_expection[i] == 1:
                    FN += 1
                elif test_labels[i] == 1 and total_expection[i] == 0:
                    FP += 1
                elif test_labels[i] == 1 and total_expection[i] == 1:
                    TN += 1

            P = TP / (TP + FP + 0.0001)
            R = TP / (TP + FN + 0.0001)
            F = 2 * P * R / (P + R + 0.0001)
            P_ = TN / (TN + FN + 0.0001)
            R_ = TN / (TN + FP + 0.0001)
            F_ = 2 * P_ * R_ / (P_ + R_ + 0.0001)

            print("Validation: Average loss: {};".format(total_test_loss))
            print("     accuracy rate: {:.4f}".format(total_accuracy))
            print("About negative samples:")
            print("     precision rate: {:.4f}".format(P))
            print("     recall rate: {:.4f}".format(R))
            print("     f-value: {:.4f}".format(F))

            print("About positive samples:")
            print("     precision rate: {:.4f}".format(P_))
            print("     recall rate: {:.4f}".format(R_))
            print("     f-value: {:.4f}".format(F_))

        def eval_ws(ws_list):
            from scipy import stats
            from numpy import linalg as LA

            logits = []
            real = []
            eval = []

            for iter_ws in ws_list:
                if iter_ws[0] not in glossary or iter_ws[1] not in glossary:
                    continue
                else:
                    A = word2id[iter_ws[0]]
                    B = word2id[iter_ws[1]]
                    real.append(iter_ws[2])
                    logits.extend([w2v[A], w2v[B]])

            for i in range(len(logits) // 2):
                A_vec = logits[2 * i]
                B_vec = logits[2 * i + 1]
                normed_A_vec = LA.norm(A_vec, axis=0)
                normed_B_vec = LA.norm(B_vec, axis=0)
                sim = sum(np.multiply(A_vec, B_vec))
                eval.append(sim / normed_A_vec / normed_B_vec)
                # print(sim/normed_A_vec/normed_B_vec)

            print('pearsonr:%s' % (stats.pearsonr(real, eval)[0]))
            print('spearmanr:%s' % (stats.spearmanr(real, eval).correlation))

        f_value()
        eval_ws(reader.read_wordsim240())
        eval_ws(reader.read_wordsim297())

        return total_expection
    else:
        print("error!")