def train():
    """Train the BERT-fed TextCNN relation classifier and checkpoint the best model.

    Steps performed:

    1. Load the train and test splits with ``data_helper2.load_data_and_labels``
       from ``FLAGS.train_path`` / ``FLAGS.test_path``.
    2. Fit a ``VocabularyProcessor`` on the ``*_text_pos`` sequences of both
       splits and convert them to fixed-length id arrays
       (``FLAGS.max_sentence_length`` columns).
    3. Build ``TextCNN`` and an Adadelta training op whose gradients are
       clipped element-wise to ``[-1, 1]``.
    4. For every training batch, encode the raw sentences with the module-level
       ``bc`` encoder and run one optimisation step.
    5. Every ``FLAGS.evaluate_every`` steps, evaluate on the whole test split,
       report through ``Logger.logging_eval`` and save a checkpoint whenever
       ``logger.best_f1`` improves.

    Summaries and checkpoints are written under ``runs/<unix timestamp>/``
    relative to the current working directory.

    NOTE(review): this source contains a second, later ``def train()``; if both
    definitions live in the same module the later one replaces this one —
    confirm which variant is intended to run.
    """
    with tf.device('/cpu:0'):
        (train_text, train_y, train_text_pos, train_e1, train_e2,
         train_pos1, train_pos2, train_sentence_len) = data_helper2.load_data_and_labels(FLAGS.train_path)
    with tf.device('/cpu:0'):
        (test_text, test_y, test_text_pos, test_e1, test_e2,
         test_pos1, test_pos2, test_sentence_len) = data_helper2.load_data_and_labels(FLAGS.test_path)

    # Build the vocabulary over both splits so that test-only tokens still get
    # an id, then map every sequence to a padded id vector.
    vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(FLAGS.max_sentence_length)
    vocab_processor.fit(train_text_pos + test_text_pos)
    train_x_pos = np.array(list(vocab_processor.transform(train_text_pos)))
    test_x_pos = np.array(list(vocab_processor.transform(test_text_pos)))

    print("\nText Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
    print("train_x = {0}".format(train_x_pos.shape))
    print("train_y = {0}".format(train_y.shape))
    print("test_x = {0}".format(test_x_pos.shape))
    print("test_y = {0}".format(test_y.shape))

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        session_conf.gpu_options.allow_growth = FLAGS.gpu_allow_growth

        # The context manager installs the session as default AND closes it on
        # exit, so the session is released even if training raises.
        with tf.Session(config=session_conf) as sess:
            cnn = TextCNN(
                sequence_length=FLAGS.max_sentence_length,
                num_classes=train_y.shape[1],
                pos_vocab_size=len(vocab_processor.vocabulary_),
                pos_embedding_size=FLAGS.pos_embedding_dim,
                text_embedding_size=FLAGS.text_embedding_size,
                filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
                num_heads=FLAGS.num_heads,
                num_filters=FLAGS.num_filters,
                l2_reg_lambda=FLAGS.l2_reg_lambda)

            # Define the training procedure.
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdadeltaOptimizer(FLAGS.learning_rate, FLAGS.decay_rate, 1e-6)
            gvs = optimizer.compute_gradients(cnn.loss)
            # compute_gradients yields (None, var) for variables that do not
            # influence the loss; clip_by_value cannot take None, so skip them.
            capped_gvs = [(tf.clip_by_value(grad, -1.0, 1.0), var)
                          for grad, var in gvs if grad is not None]
            train_op = optimizer.apply_gradients(capped_gvs, global_step=global_step)

            # Output directory for models and summaries.
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy.
            loss_summary = tf.summary.scalar("loss", cnn.loss)
            acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

            logger = Logger(out_dir)

            # Train summaries.
            train_summary_op = tf.summary.merge([loss_summary, acc_summary])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

            # Dev summaries. Nothing is written to this writer below; it is
            # kept so the "summaries/dev" directory is still created as before.
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

            # TensorFlow assumes the checkpoint directory already exists.
            checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints)

            # Initialize all variables.
            sess.run(tf.global_variables_initializer())

            # Training batches: (raw sentence, label, position/tag id vector).
            batches = data_helper2.batch_iter(
                list(zip(train_text, train_y, train_x_pos)),
                FLAGS.batch_size, FLAGS.num_epochs)

            # The evaluation data never changes, so build it once instead of
            # on every evaluation round.
            test_data = list(zip(test_text, test_y, test_x_pos))

            best_f1 = 0.0  # best F1 seen so far; gates checkpoint saving

            for batch in batches:
                train_bx, train_by, train_pos = zip(*batch)

                # Sentence embeddings come from the external `bc` encoder.
                text_embedded_chars = bc.encode(list(train_bx))
                feed_dict = {
                    cnn.text_embedded_chars: text_embedded_chars,
                    cnn.input_y: train_by,
                    cnn.input_pos: train_pos,
                    cnn.dropout_keep_prob: FLAGS.dropout_keep_prob,
                    cnn.emb_dropout_keep_prob: FLAGS.emb_dropout_keep_prob,
                }
                _, step, summaries, loss, accuracy = sess.run(
                    [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy],
                    feed_dict)
                train_summary_writer.add_summary(summaries, step)

                # Training log display.
                if step % FLAGS.display_every == 0:
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))

                # Evaluation.
                if step % FLAGS.evaluate_every == 0:
                    print("\nEvaluation:")
                    test_batches = data_helper2.batch_iter(
                        test_data, FLAGS.batch_size, 1, shuffle=False)

                    losses = 0.0
                    accuracy = 0.0
                    predictions = []
                    iter_cnt = 0
                    for test_batch in test_batches:
                        test_bx, test_by, test_pos = zip(*test_batch)
                        test_text_embedded_chars = bc.encode(list(test_bx))
                        feed_dict = {
                            cnn.text_embedded_chars: test_text_embedded_chars,
                            cnn.input_y: test_by,
                            cnn.input_pos: test_pos,
                            # Dropout is disabled during evaluation.
                            cnn.emb_dropout_keep_prob: 1.0,
                            cnn.dropout_keep_prob: 1.0,
                        }
                        loss, acc, pred = sess.run(
                            [cnn.loss, cnn.accuracy, cnn.predictions], feed_dict)
                        losses += loss
                        accuracy += acc
                        predictions += pred.tolist()
                        iter_cnt += 1
                    losses /= iter_cnt
                    accuracy /= iter_cnt
                    predictions = np.array(predictions, dtype='int')

                    # Report the mean loss over all test batches; previously
                    # the loss of the last batch only was logged.
                    logger.logging_eval(step, losses, accuracy, predictions)

                    # Model checkpoint: only keep models that improve F1.
                    if best_f1 < logger.best_f1:
                        best_f1 = logger.best_f1
                        path = saver.save(sess, checkpoint_prefix + "-{:.3g}".format(best_f1),
                                          global_step=step)
                        print("Saved model checkpoint to {}\n".format(path))
def train():
    """Train the TextCNN relation classifier on precomputed BERT vectors.

    Steps performed:

    1. Load the train and test splits with ``data_helper2.load_data_and_labels``
       from ``FLAGS.train_path`` / ``FLAGS.test_path``.
    2. Fit one ``VocabularyProcessor`` on the sentences and another on the
       relative-position sequences of both splits, and convert them to
       fixed-length id arrays (``FLAGS.max_sentence_length`` columns).
    3. Build ``TextCNN`` and an Adadelta training op whose gradients are
       clipped element-wise to ``[-1, 1]``.
    4. For every training batch, fetch the sentence embeddings of the batch's
       example ids with ``server_bert.load_clean_vector`` and run one
       optimisation step.
    5. Every ``FLAGS.evaluate_every`` steps, evaluate on the whole test split,
       report through ``Logger.logging_eval`` and save a checkpoint whenever
       ``logger.best_f1`` improves.

    Summaries and checkpoints are written under ``runs/<unix timestamp>/``
    relative to the current working directory.

    NOTE(review): this source contains another, earlier ``def train()``; if
    both definitions live in the same module this later one is the one that
    takes effect — confirm that is intended.
    """
    with tf.device('/cpu:0'):
        (train_text, train_y, train_e1, train_e2,
         train_pos1, train_pos2, train_sentence_len) = data_helper2.load_data_and_labels(
            FLAGS.train_path)
    with tf.device('/cpu:0'):
        (test_text, test_y, test_e1, test_e2,
         test_pos1, test_pos2, test_sentence_len) = data_helper2.load_data_and_labels(
            FLAGS.test_path)

    # Build the text vocabulary over both splits and map every sentence to a
    # padded vector of word ids.
    vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
        FLAGS.max_sentence_length)
    vocab_processor.fit(train_text + test_text)
    train_x = np.array(list(vocab_processor.transform(train_text)))
    test_x = np.array(list(vocab_processor.transform(test_text)))
    test_text = np.array(test_text)

    print("\nText Vocabulary Size: {:d}".format(
        len(vocab_processor.vocabulary_)))
    print("train_x = {0}".format(train_x.shape))
    print("train_y = {0}".format(train_y.shape))
    print("test_x = {0}".format(test_x.shape))
    print("test_y = {0}".format(test_y.shape))

    # Relative-position sequences are tokenised the same way, with a single
    # vocabulary shared by both entities and both splits.
    pos_vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
        FLAGS.max_sentence_length)
    pos_vocab_processor.fit(train_pos1 + train_pos2 + test_pos1 + test_pos2)
    train_p1 = np.array(list(pos_vocab_processor.transform(train_pos1)))
    train_p2 = np.array(list(pos_vocab_processor.transform(train_pos2)))
    test_p1 = np.array(list(pos_vocab_processor.transform(test_pos1)))
    test_p2 = np.array(list(pos_vocab_processor.transform(test_pos2)))

    print("\nPosition Vocabulary Size: {:d}".format(
        len(pos_vocab_processor.vocabulary_)))
    print("train_p1 = {0}".format(train_p1.shape))
    print("test_p1 = {0}".format(test_p1.shape))
    print("")

    # Example ids are the row indices of each split; they are what
    # server_bert.load_clean_vector receives to select embedding rows.
    id_train = list(range(len(train_text)))
    id_test = list(range(len(test_text)))

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        session_conf.gpu_options.allow_growth = FLAGS.gpu_allow_growth

        # The context manager installs the session as default AND closes it on
        # exit, so the session is released even if training raises.
        with tf.Session(config=session_conf) as sess:
            model = TextCNN(
                # NOTE(review): hard-coded rather than FLAGS.max_sentence_length;
                # presumably tied to the length of the precomputed embeddings —
                # confirm before changing either value.
                sequence_length=90,
                num_classes=train_y.shape[1],
                text_embedding_size=FLAGS.text_embedding_size,
                pos_vocab_size=len(pos_vocab_processor.vocabulary_),
                pos_embedding_size=FLAGS.pos_embedding_dim,
                hidden_size=FLAGS.hidden_size,
                filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
                num_filters=FLAGS.num_filters,
                num_heads=FLAGS.num_heads,
                attention_size=FLAGS.attention_size,
                l2_reg_lambda=FLAGS.l2_reg_lambda)

            # Define the training procedure.
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdadeltaOptimizer(FLAGS.learning_rate,
                                                   FLAGS.decay_rate, 1e-6)
            gvs = optimizer.compute_gradients(model.loss)
            # compute_gradients yields (None, var) for variables that do not
            # influence the loss; clip_by_value cannot take None, so skip them.
            capped_gvs = [(tf.clip_by_value(grad, -1.0, 1.0), var)
                          for grad, var in gvs if grad is not None]
            train_op = optimizer.apply_gradients(capped_gvs,
                                                 global_step=global_step)

            # Output directory for models and summaries.
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, "runs", timestamp))
            print("\nWriting to {}\n".format(out_dir))

            logger = Logger(out_dir)

            # Summaries for loss and accuracy.
            loss_summary = tf.summary.scalar("loss", model.loss)
            acc_summary = tf.summary.scalar("accuracy", model.accuracy)

            # Train summaries.
            train_summary_op = tf.summary.merge([loss_summary, acc_summary])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)

            # TensorFlow assumes the checkpoint directory already exists.
            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=FLAGS.num_checkpoints)

            # Initialize all variables.
            sess.run(tf.global_variables_initializer())

            # Training batches.
            train_batches = data_helper2.batch_iter(
                list(zip(id_train, train_x, train_y, train_text,
                         train_e1, train_e2, train_p1, train_p2)),
                FLAGS.batch_size, FLAGS.num_epochs)

            # The evaluation data never changes, so build it once instead of
            # on every evaluation round.
            test_data = list(zip(id_test, test_x, test_y, test_text,
                                 test_e1, test_e2, test_p1, test_p2))

            best_f1 = 0.0  # best F1 seen so far; gates checkpoint saving

            for train_batch in train_batches:
                (train_bx, train_bx1, train_by, train_btxt,
                 train_be1, train_be2, train_bp1, train_bp2) = zip(*train_batch)

                # train_bx holds example ids used to pick the precomputed
                # embedding rows for this batch.
                text_embedded_chars = server_bert.load_clean_vector(
                    "/home/wangyan/relaton-extraction/embedding_unclean.npy",
                    list(train_bx), train_sentence_len)
                feed_dict = {
                    model.text_embedded_chars: text_embedded_chars,
                    model.input_x: train_bx1,
                    model.input_y: train_by,
                    model.input_e1: train_be1,
                    model.input_e2: train_be2,
                    model.input_p1: train_bp1,
                    model.input_p2: train_bp2,
                    model.emb_dropout_keep_prob: FLAGS.emb_dropout_keep_prob,
                    model.rnn_dropout_keep_prob: FLAGS.rnn_dropout_keep_prob,
                    model.dropout_keep_prob: FLAGS.dropout_keep_prob,
                }
                _, step, summaries, loss, accuracy = sess.run(
                    [train_op, global_step, train_summary_op,
                     model.loss, model.accuracy],
                    feed_dict)
                train_summary_writer.add_summary(summaries, step)

                # Training log display.
                if step % FLAGS.display_every == 0:
                    logger.logging_train(step, loss, accuracy)

                # Evaluation.
                if step % FLAGS.evaluate_every == 0:
                    print("\nEvaluation:")
                    test_batches = data_helper2.batch_iter(
                        test_data, FLAGS.batch_size, 1, shuffle=False)

                    losses = 0.0
                    accuracy = 0.0
                    predictions = []
                    iter_cnt = 0
                    for test_batch in test_batches:
                        (test_bx, test_bx1, test_by, test_btxt,
                         test_be1, test_be2, test_bp1, test_bp2) = zip(*test_batch)
                        test_text_embedded_chars = server_bert.load_clean_vector(
                            "/home/wangyan/relaton-extraction/embedding_unclean_test.npy",
                            list(test_bx), test_sentence_len)
                        feed_dict = {
                            model.text_embedded_chars: test_text_embedded_chars,
                            model.input_x: test_bx1,
                            model.input_y: test_by,
                            model.input_e1: test_be1,
                            model.input_e2: test_be2,
                            model.input_p1: test_bp1,
                            model.input_p2: test_bp2,
                            # Dropout is disabled during evaluation.
                            model.emb_dropout_keep_prob: 1.0,
                            model.rnn_dropout_keep_prob: 1.0,
                            model.dropout_keep_prob: 1.0,
                        }
                        loss, acc, pred = sess.run(
                            [model.loss, model.accuracy, model.predictions],
                            feed_dict)
                        losses += loss
                        accuracy += acc
                        predictions += pred.tolist()
                        iter_cnt += 1
                    losses /= iter_cnt
                    accuracy /= iter_cnt
                    predictions = np.array(predictions, dtype='int')

                    # Report the mean loss over all test batches; previously
                    # the loss of the last batch only was logged.
                    logger.logging_eval(step, losses, accuracy, predictions)

                    # Model checkpoint: only keep models that improve F1.
                    if best_f1 < logger.best_f1:
                        best_f1 = logger.best_f1
                        path = saver.save(
                            sess,
                            checkpoint_prefix + "-{:.3g}".format(best_f1),
                            global_step=step)
                        print("Saved model checkpoint to {}\n".format(path))