# Earlier variant: encodes each training/eval batch on the fly with bc.encode();
# the train() defined further below loads precomputed embedding vectors (.npy)
# via server_bert instead.
def train_with_bc_encoding():
    with tf.device('/cpu:0'):
        train_text, train_y, train_text_pos, train_e1, train_e2, train_pos1, train_pos2, train_sentence_len = data_helper2.load_data_and_labels(
            FLAGS.train_path)
    with tf.device('/cpu:0'):
        test_text, test_y, test_text_pos, test_e1, test_e2, test_pos1, test_pos2, test_sentence_len = data_helper2.load_data_and_labels(
            FLAGS.test_path)
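    # data_helper2.load_data_and_labels() is expected to return parallel lists/arrays:
    # raw sentences, one-hot relation labels, position-annotated text, entity indices
    # (e1/e2), relative-position strings for both entities, and per-sentence lengths
    # (an assumption inferred from how the returned tuples are unpacked here).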

    # Build vocabulary
    # Example: x_text[3] = "A misty <e1>ridge</e1> uprises from the <e2>surge</e2>."
    # ['a misty ridge uprises from the surge <UNK> <UNK> ... <UNK>']
    # =>
    # [27 39 40 41 42  1 43  0  0 ... 0]
    # dimension = FLAGS.max_sentence_length
    # print("text:",x_text)
    # text_vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(FLAGS.max_sentence_length)
    # x = np.array(list(text_vocab_processor.fit_transform(x_text)))#token
    # pretrain_W = utils.load_word2vec(FLAGS.embedding_path, FLAGS.text_embedding_dim, text_vocab_processor)
    # print("pretrain_w:",pretrain_W)
    # print(pretrain_W.shape) #(19151,300)
    # print("Text Vocabulary Size: {:d}".format(len(text_vocab_processor.vocabulary_)))
    # print("vocabulary:", text_vocab_processor.vocabulary_._reverse_mapping)
    # with open("vocabulary.txt","w",encoding="utf-8") as f:
    #     f.write(str(x))
    # print("x = {0}".format(x.shape)) #(8000,90)
    # print("y = {0}".format(y.shape)) #(8000,19)
    # print("")

    # Example: pos1[3] = [-2 -1  0  1  2   3   4 999 999 999 ... 999]
    # [95 96 97 98 99 100 101 999 999 999 ... 999]
    # =>
    # [11 12 13 14 15  16  21  17  17  17 ...  17]
    # dimension = MAX_SENTENCE_LENGTH
    # pos_vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(FLAGS.max_sentence_length)
    # pos_vocab_processor.fit(pos1 + pos2) #fit
    # print("pos vocab position:", pos_vocab_processor)
    # p1 = np.array(list(pos_vocab_processor.transform(pos1))) #tokens
    # print("p1:", p1)
    # p2 = np.array(list(pos_vocab_processor.transform(pos2)))
    # print("Position Vocabulary Size: {:d}".format(len(pos_vocab_processor.vocabulary_)))
    # with open("position.txt", "w", encoding="utf-8") as f:
    #         f.write(str(x))
    # print("position_1 = {0}".format(p1.shape)) #(8000,90)
    # print("position_2 = {0}".format(p2.shape)) #(8000,90)
    # print("")
    vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(FLAGS.max_sentence_length)
    vocab_processor.fit(train_text_pos + test_text_pos)
    train_x_pos = np.array(list(vocab_processor.transform(train_text_pos)))
    test_x_pos = np.array(list(vocab_processor.transform(test_text_pos)))
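    # The shared VocabularyProcessor is fit on train + test position-annotated text,
    # so transform() maps every sentence to a fixed-length id vector of size
    # FLAGS.max_sentence_length (short sentences are padded with id 0, which is also
    # the <UNK> id), matching the worked example in the comment block above.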
    # train_text = np.array(train_text_pos)
    # test_text = np.array(test_text)
    print("\nText Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
    print("train_x = {0}".format(train_x_pos.shape))
    print("train_y = {0}".format(train_y.shape))
    print("test_x = {0}".format(test_x_pos.shape))
    print("test_y = {0}".format(test_y.shape))

    # pos_vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(FLAGS.max_sentence_length)
    # pos_vocab_processor.fit(train_pos1 + train_pos2 + test_pos1 + test_pos2)
    # train_p1 = np.array(list(pos_vocab_processor.transform(train_pos1)))
    # train_p2 = np.array(list(pos_vocab_processor.transform(train_pos2)))
    # test_p1 = np.array(list(pos_vocab_processor.transform(test_pos1)))
    # test_p2 = np.array(list(pos_vocab_processor.transform(test_pos2)))
    # print("\nPosition Vocabulary Size: {:d}".format(len(pos_vocab_processor.vocabulary_)))
    # print("train_p1 = {0}".format(train_p1.shape))
    # print("test_p1 = {0}".format(test_p1.shape))
    # print("")

    x_text_to_id = {}
    id_to_x_text = {}
    id_train = []
    for i, str1 in enumerate(train_text):
        # print(str1)
        x_text_to_id[str1] = i
        id_to_x_text[i] = str1
        id_train.append(i)

    x_text_to_id = {}
    id_to_x_text = {}
    id_test = []
    for i, str1 in enumerate(test_text):
        x_text_to_id[str1] = i
        id_to_x_text[i] = str1
        id_test.append(i)
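    # Note: in this variant the id mappings above are bookkeeping only; the batches
    # below are built from (train_text, train_y, train_x_pos) directly, so id_train /
    # id_test are never fed to the model.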

    # # Randomly shuffle data to split into train and test(dev)
    # np.random.seed(10)
    # x_text_to_id = {}
    # id_to_x_text = {}
    # id = []
    # for i, str1 in enumerate(x_text_clean):
    #     x_text_to_id[str1]=i
    #     id_to_x_text[i] = str1
    #     id.append(i)
    # # print(x_text_to_id)
    # # print(id_to_x_text)
    # # print(id[0:3])
    # print("id:",id)
    #
    # shuffle_indices = np.random.permutation(np.arange(len(y))) #len(y)=8000
    # id_shuffled = np.array(id)[shuffle_indices]
    #
    # # # p1_shuffled = p1[shuffle_indices]
    # # # p2_shuffled = p2[shuffle_indices]
    # y_shuffled = y[shuffle_indices]
    # # print(x_shuffled, p1_shuffled,p2_shuffled,y_shuffled)
    #
    # # Split train/test set
    # # TODO: This is very crude, should use cross-validation
    # dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y))) #x_train=7200, x_dev =800
    # id_train, id_dev = id_shuffled[:dev_sample_index], id_shuffled[dev_sample_index:]
    # # p1_train, p1_dev = p1_shuffled[:dev_sample_index], p1_shuffled[dev_sample_index:]
    # # p2_train, p2_dev = p2_shuffled[:dev_sample_index], p2_shuffled[dev_sample_index:]
    # y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
    # print("Train/Dev split: {:d}/{:d}\n".format(len(y_train), len(y_dev)))
    # # x_train = [id_to_x_text[i] for i in id_train]
    # # x_dev = [id_to_x_text[i] for i in id_dev]
    # print("id_train:", id_train)
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        session_conf.gpu_options.allow_growth = FLAGS.gpu_allow_growth
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn = TextCNN(
                sequence_length=FLAGS.max_sentence_length,  # 90
                num_classes=train_y.shape[1],  # 19
                pos_vocab_size=len(vocab_processor.vocabulary_),
                pos_embedding_size=FLAGS.pos_embedding_dim,
                text_embedding_size=FLAGS.text_embedding_size,  # 300
                filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),  # 2,3,4,5
                num_heads=FLAGS.num_heads,
                num_filters=FLAGS.num_filters,  # 128
                l2_reg_lambda=FLAGS.l2_reg_lambda)  # 1e-5

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
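            # tf.train.AdadeltaOptimizer(learning_rate, rho, epsilon): the second
            # positional argument is the rho decay term and the third is epsilon,
            # so FLAGS.decay_rate is used as Adadelta's rho here.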
            optimizer = tf.train.AdadeltaOptimizer(FLAGS.learning_rate, FLAGS.decay_rate, 1e-6)
            gvs = optimizer.compute_gradients(cnn.loss)
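            # Element-wise clipping of every gradient to [-1, 1] before the update.
            # compute_gradients() yields (None, var) pairs for variables the loss does
            # not touch, and tf.clip_by_value(None, ...) would fail, so this assumes
            # every trainable variable receives a gradient.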
            capped_gvs = [(tf.clip_by_value(grad, -1.0, 1.0), var) for grad, var in gvs]
            train_op = optimizer.apply_gradients(capped_gvs, global_step=global_step)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", cnn.loss)
            acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

            # Logger
            logger = Logger(out_dir)

            # Train Summaries
            train_summary_op = tf.summary.merge([loss_summary, acc_summary])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

            # Dev summaries
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

            # Checkpoint directory. TensorFlow assumes this directory already exists, so we need to create it.
            checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints)
            # saver = tf.train.import_meta_graph('/home/wangyan/relation_extraction/runs/1556982398/checkpoints/model-85.9-70800.meta')
            # saver.restore(sess, tf.train.latest_checkpoint('./ckpt'))
            # Write vocabulary
            # text_vocab_processor.save(os.path.join(out_dir, "text_vocab"))
            # pos_vocab_processor.save(os.path.join(out_dir, "pos_vocab"))

            # Initialize all variables
            sess.run(tf.global_variables_initializer())
            # FLAGS._sess =sess

            # Pre-trained word2vec
            # if FLAGS.embedding_path:
            #     pretrain_W = utils.load_word2vec(FLAGS.embedding_path, FLAGS.text_embedding_dim, text_vocab_processor)
            #     sess.run(cnn.W_text.assign(pretrain_W))
            #     print("Success to load pre-trained word2vec model!\n")
            # print("id_train:", id_train.shape)
            # print("train_y", train_y.shape)
            id_train = np.array(id_train)  # shape (8000,)
            # print(id_train.shape)
            # print(id_train)
            # print(train_y.shape)
            # print(train_y)
            # print(list(zip(id_train, train_y)))
            # Generate batches
            batches = data_helper2.batch_iter(list(zip(train_text, train_y, train_x_pos)),
                                              FLAGS.batch_size, FLAGS.num_epochs)

            # Training loop. For each batch...
            best_f1 = 0.0  # Best F1 so far; used to decide when to save a checkpoint
            # text_embedded_chars_dev = server_bert.load_clean_vector("embedding_unclean.npy", list(id_dev), sentence_len)

            # print("id_dev:",id_dev)
            # print(text_embedded_chars_dev.shape) #(800 90 768)
            for batch in batches:
                train_bx, train_by, train_pos = zip(*batch)
                # print(x_batch)
                # print(list(x_batch))
                # print(len(x_batch)) #20
                # print(len(y_batch)) #20

                # Train
                # text_embedded_chars = server_bert.load_vector("embedding_unclean.npy", list(train_bx))  # (20, 90, 768)
                # print(text_embedded_chars.shape)  # (20, 90, 768)
                text_embedded_chars = bc.encode(list(train_bx))
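                # bc is assumed to be a bert-serving-client BertClient created at module
                # level with token-level pooling, so bc.encode() returns embeddings of
                # shape (batch_size, max_seq_len, 768) as expected by
                # cnn.text_embedded_chars (cf. the (20, 90, 768) shapes noted above).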
                feed_dict = {
                    cnn.text_embedded_chars: text_embedded_chars,
                    cnn.input_y: train_by,
                    cnn.input_pos: train_pos,
                    cnn.dropout_keep_prob: FLAGS.dropout_keep_prob,
                    cnn.emb_dropout_keep_prob: FLAGS.emb_dropout_keep_prob
                }
                _, step, summaries, loss, accuracy = sess.run(
                    [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy], feed_dict)
                train_summary_writer.add_summary(summaries, step)

                # Training log display
                if step % FLAGS.display_every == 0:
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))

                # Evaluation
                if step % FLAGS.evaluate_every == 0:
                    print("\nEvaluation:")
                    # Generate batches
                    test_batches = data_helper2.batch_iter(list(zip(test_text, test_y, test_x_pos)),
                                                           FLAGS.batch_size, 1, shuffle=False)
                    # Evaluation loop. For each test batch...
                    losses = 0.0
                    accuracy = 0.0
                    predictions = []
                    iter_cnt = 0
                    for test_batch in test_batches:
                        test_bx, test_by, test_pos = zip(*test_batch)
                        # test_text_embedded_chars = server_bert.load_vector("embedding_unclean_test.npy", list(test_bx))  # (20, 90, 768)
                        test_text_embedded_chars = bc.encode(list(test_bx))
                        feed_dict = {
                            cnn.text_embedded_chars: test_text_embedded_chars,
                            cnn.input_y: test_by,
                            cnn.input_pos: test_pos,
                            cnn.emb_dropout_keep_prob: 1.0,
                            cnn.dropout_keep_prob: 1.0
                        }
                        loss, acc, pred = sess.run(
                            [cnn.loss, cnn.accuracy, cnn.predictions], feed_dict)
                        losses += loss
                        accuracy += acc
                        predictions += pred.tolist()
                        iter_cnt += 1
                    losses /= iter_cnt
                    accuracy /= iter_cnt
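                    # losses / accuracy are simple means over batches; since the last
                    # batch may be smaller than FLAGS.batch_size, this slightly differs
                    # from an example-weighted average.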
                    predictions = np.array(predictions, dtype='int')

                    logger.logging_eval(step, losses, accuracy, predictions)

                    # Model checkpoint
                    if best_f1 < logger.best_f1:
                        best_f1 = logger.best_f1
                        path = saver.save(sess, checkpoint_prefix + "-{:.3g}".format(best_f1), global_step=step)
                        print("Saved model checkpoint to {}\n".format(path))
def train():
    with tf.device('/cpu:0'):
        train_text, train_y, train_e1, train_e2, train_pos1, train_pos2, train_sentence_len = data_helper2.load_data_and_labels(
            FLAGS.train_path)
    with tf.device('/cpu:0'):
        test_text, test_y, test_e1, test_e2, test_pos1, test_pos2, test_sentence_len = data_helper2.load_data_and_labels(
            FLAGS.test_path)

    # Build vocabulary
    # Example: x_text[3] = "A misty <e1>ridge</e1> uprises from the <e2>surge</e2>."
    # ['a misty ridge uprises from the surge <UNK> <UNK> ... <UNK>']
    # =>
    # [27 39 40 41 42  1 43  0  0 ... 0]
    # dimension = MAX_SENTENCE_LENGTH
    vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
        FLAGS.max_sentence_length)
    vocab_processor.fit(train_text + test_text)
    train_x = np.array(list(vocab_processor.transform(train_text)))
    test_x = np.array(list(vocab_processor.transform(test_text)))
    # train_text = np.array(train_text)
    test_text = np.array(test_text)
    print("\nText Vocabulary Size: {:d}".format(
        len(vocab_processor.vocabulary_)))
    print("train_x = {0}".format(train_x.shape))
    print("train_y = {0}".format(train_y.shape))
    print("test_x = {0}".format(test_x.shape))
    print("test_y = {0}".format(test_y.shape))

    # Example: pos1[3] = [-2 -1  0  1  2   3   4 999 999 999 ... 999]
    # [95 96 97 98 99 100 101 999 999 999 ... 999]
    # =>
    # [11 12 13 14 15  16  21  17  17  17 ...  17]
    # dimension = MAX_SENTENCE_LENGTH
    pos_vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
        FLAGS.max_sentence_length)
    pos_vocab_processor.fit(train_pos1 + train_pos2 + test_pos1 + test_pos2)
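    # train_pos1/train_pos2 (and the test counterparts) are presumably space-separated
    # strings of relative offsets, so the shared VocabularyProcessor assigns one id per
    # distinct offset value and pads each sequence to FLAGS.max_sentence_length
    # (cf. the worked example above).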
    train_p1 = np.array(list(pos_vocab_processor.transform(train_pos1)))
    train_p2 = np.array(list(pos_vocab_processor.transform(train_pos2)))
    test_p1 = np.array(list(pos_vocab_processor.transform(test_pos1)))
    test_p2 = np.array(list(pos_vocab_processor.transform(test_pos2)))
    print("\nPosition Vocabulary Size: {:d}".format(
        len(pos_vocab_processor.vocabulary_)))
    print("train_p1 = {0}".format(train_p1.shape))
    print("test_p1 = {0}".format(test_p1.shape))
    print("")

    x_text_to_id = {}
    id_to_x_text = {}
    id_train = []
    for i, str1 in enumerate(train_text):
        x_text_to_id[str1] = i
        id_to_x_text[i] = str1
        id_train.append(i)

    x_text_to_id = {}
    id_to_x_text = {}
    id_test = []
    for i, str1 in enumerate(test_text):
        x_text_to_id[str1] = i
        id_to_x_text[i] = str1
        id_test.append(i)
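    # id_train / id_test travel through batch_iter alongside each example and are
    # passed to server_bert.load_clean_vector() below, presumably as row indices into
    # the precomputed embedding .npy files, so shuffled batches still pair each
    # sentence with its stored vector.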

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        session_conf.gpu_options.allow_growth = FLAGS.gpu_allow_growth
        sess = tf.Session(config=session_conf)
        with sess.as_default():
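            # Unlike the earlier variant, this TextCNN takes hidden_size and
            # attention_size and is fed an rnn_dropout_keep_prob below, suggesting it
            # combines convolutional, recurrent, and attention layers (an assumption
            # based on its parameters, not on the model code itself).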
            model = TextCNN(
                sequence_length=FLAGS.max_sentence_length,  # 90
                num_classes=train_y.shape[1],
                text_embedding_size=FLAGS.text_embedding_size,  # 300
                pos_vocab_size=len(pos_vocab_processor.vocabulary_),
                pos_embedding_size=FLAGS.pos_embedding_dim,  # 50
                hidden_size=FLAGS.hidden_size,
                filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
                num_filters=FLAGS.num_filters,
                num_heads=FLAGS.num_heads,
                attention_size=FLAGS.attention_size,
                l2_reg_lambda=FLAGS.l2_reg_lambda)

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdadeltaOptimizer(FLAGS.learning_rate,
                                                   FLAGS.decay_rate, 1e-6)
            gvs = optimizer.compute_gradients(model.loss)
            capped_gvs = [(tf.clip_by_value(grad, -1.0, 1.0), var)
                          for grad, var in gvs]
            train_op = optimizer.apply_gradients(capped_gvs,
                                                 global_step=global_step)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, "runs", timestamp))
            print("\nWriting to {}\n".format(out_dir))

            # Logger
            logger = Logger(out_dir)

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", model.loss)
            acc_summary = tf.summary.scalar("accuracy", model.accuracy)

            # Train Summaries
            train_summary_op = tf.summary.merge([loss_summary, acc_summary])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)

            # Checkpoint directory. TensorFlow assumes this directory already exists, so we need to create it.
            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=FLAGS.num_checkpoints)

            # Write vocabulary
            # vocab_processor.save(os.path.join(out_dir, "vocab"))
            # pos_vocab_processor.save(os.path.join(out_dir, "pos_vocab"))

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            # if FLAGS.embeddings == "word2vec":
            #     pretrain_W = utils.load_word2vec('resource/GoogleNews-vectors-negative300.bin', FLAGS.embedding_size, vocab_processor)
            #     sess.run(model.W_text.assign(pretrain_W))
            #     print("Success to load pre-trained word2vec model!\n")
            # elif FLAGS.embeddings == "glove100":
            #     pretrain_W = utils.load_glove('resource/glove.6B.100d.txt', FLAGS.embedding_size, vocab_processor)
            #     sess.run(model.W_text.assign(pretrain_W))
            #     print("Success to load pre-trained glove100 model!\n")
            # elif FLAGS.embeddings == "glove300":
            #     pretrain_W = utils.load_glove('resource/glove.840B.300d.txt', FLAGS.embedding_size, vocab_processor)
            #     sess.run(model.W_text.assign(pretrain_W))
            #     print("Success to load pre-trained glove300 model!\n")

            # Generate batches
            train_batches = data_helper2.batch_iter(
                list(
                    zip(id_train, train_x, train_y, train_text, train_e1,
                        train_e2, train_p1, train_p2)), FLAGS.batch_size,
                FLAGS.num_epochs)
            # Training loop. For each batch...
            best_f1 = 0.0  # Best F1 so far; used to decide when to save a checkpoint
            for train_batch in train_batches:
                train_bx, train_bx1, train_by, train_btxt, train_be1, train_be2, train_bp1, train_bp2 = zip(
                    *train_batch)
                text_embedded_chars = server_bert.load_clean_vector(
                    "/home/wangyan/relaton-extraction/embedding_unclean.npy",
                    list(train_bx), train_sentence_len)  # (20, 90, 768)
                feed_dict = {
                    model.text_embedded_chars: text_embedded_chars,
                    model.input_x: train_bx1,
                    model.input_y: train_by,
                    model.input_e1: train_be1,
                    model.input_e2: train_be2,
                    model.input_p1: train_bp1,
                    model.input_p2: train_bp2,
                    model.emb_dropout_keep_prob: FLAGS.emb_dropout_keep_prob,
                    model.rnn_dropout_keep_prob: FLAGS.rnn_dropout_keep_prob,
                    model.dropout_keep_prob: FLAGS.dropout_keep_prob
                }
                _, step, summaries, loss, accuracy = sess.run([
                    train_op, global_step, train_summary_op, model.loss,
                    model.accuracy
                ], feed_dict)
                train_summary_writer.add_summary(summaries, step)

                # Training log display
                if step % FLAGS.display_every == 0:
                    logger.logging_train(step, loss, accuracy)

                # Evaluation
                if step % FLAGS.evaluate_every == 0:
                    print("\nEvaluation:")
                    # Generate batches
                    test_batches = data_helper2.batch_iter(list(
                        zip(id_test, test_x, test_y, test_text, test_e1,
                            test_e2, test_p1, test_p2)),
                                                           FLAGS.batch_size,
                                                           1,
                                                           shuffle=False)
                    # Evaluation loop. For each test batch...
                    losses = 0.0
                    accuracy = 0.0
                    predictions = []
                    iter_cnt = 0
                    for test_batch in test_batches:
                        test_bx, test_bx1, test_by, test_btxt, test_be1, test_be2, test_bp1, test_bp2 = zip(
                            *test_batch)
                        test_text_embedded_chars = server_bert.load_clean_vector(
                            "/home/wangyan/relaton-extraction/embedding_unclean_test.npy",
                            list(test_bx), test_sentence_len)  # (20, 90, 768)
                        feed_dict = {
                            model.text_embedded_chars:
                            test_text_embedded_chars,
                            model.input_x: test_bx1,
                            model.input_y: test_by,
                            model.input_e1: test_be1,
                            model.input_e2: test_be2,
                            model.input_p1: test_bp1,
                            model.input_p2: test_bp2,
                            model.emb_dropout_keep_prob: 1.0,
                            model.rnn_dropout_keep_prob: 1.0,
                            model.dropout_keep_prob: 1.0
                        }
                        loss, acc, pred = sess.run(
                            [model.loss, model.accuracy, model.predictions],
                            feed_dict)
                        losses += loss
                        accuracy += acc
                        predictions += pred.tolist()
                        iter_cnt += 1
                    losses /= iter_cnt
                    accuracy /= iter_cnt
                    predictions = np.array(predictions, dtype='int')

                    logger.logging_eval(step, losses, accuracy, predictions)

                    # Model checkpoint
                    if best_f1 < logger.best_f1:
                        best_f1 = logger.best_f1
                        path = saver.save(sess,
                                          checkpoint_prefix +
                                          "-{:.3g}".format(best_f1),
                                          global_step=step)
                        print("Saved model checkpoint to {}\n".format(path))