def train():
    logging.info("Preparing summarization data.")
    docid, sumid, doc_dict, sum_dict = \
        data_util.load_data(
            FLAGS.data_dir + "/train/train.article.txt",
            FLAGS.data_dir + "/train/train.title.txt",
            FLAGS.data_dir + "/doc_dict.txt",
            FLAGS.data_dir + "/sum_dict.txt",
            FLAGS.doc_vocab_size, FLAGS.sum_vocab_size)
    val_docid, val_sumid = \
        data_util.load_valid_data(
            FLAGS.data_dir + "/train/valid.article.filter.txt",
            FLAGS.data_dir + "/train/valid.title.filter.txt",
            doc_dict, sum_dict)

    with tf.Session() as sess:
        # Create model.
        logging.info("Creating %d layers of %d units." %
                     (FLAGS.num_layers, FLAGS.size))
        train_writer = tf.summary.FileWriter(FLAGS.tfboard, sess.graph)
        model = create_model(sess, False)

        # Read data into buckets and compute their sizes.
        logging.info("Create buckets.")
        dev_set = create_bucket(val_docid, val_sumid)
        train_set = create_bucket(docid, sumid)

        train_bucket_sizes = [len(train_set[b]) for b in range(len(_buckets))]
        train_total_size = float(sum(train_bucket_sizes))
        train_buckets_scale = [
            sum(train_bucket_sizes[:i + 1]) / train_total_size
            for i in range(len(train_bucket_sizes))
        ]
        for (s_size, t_size), nsample in zip(_buckets, train_bucket_sizes):
            logging.info("Train set bucket ({}, {}) has {} samples.".format(
                s_size, t_size, nsample))

        # This is the training loop.
        step_time, loss = 0.0, 0.0
        current_step = sess.run(model.global_step)

        while current_step <= FLAGS.max_iter:
            random_number_01 = np.random.random_sample()
            bucket_id = min([
                i for i in range(len(train_buckets_scale))
                if train_buckets_scale[i] > random_number_01
            ])

            # Get a batch and make a step.
            start_time = time.time()
            encoder_inputs, decoder_inputs, encoder_len, decoder_len = \
                model.get_batch(train_set, bucket_id)
            step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                      encoder_len, decoder_len, False,
                                      train_writer)
            step_time += (time.time() - start_time) / \
                FLAGS.steps_per_validation
            loss += step_loss * FLAGS.batch_size / np.sum(decoder_len) \
                / FLAGS.steps_per_validation
            current_step += 1

            # Once in a while, we save checkpoint.
            if current_step % FLAGS.steps_per_checkpoint == 0:
                # Save checkpoint and zero timer and loss.
                checkpoint_path = os.path.join(FLAGS.train_dir, "model.ckpt")
                model.saver.save(sess, checkpoint_path,
                                 global_step=model.global_step)

            # Once in a while, we print statistics and run evals.
            if current_step % FLAGS.steps_per_validation == 0:
                # Print statistics for the previous epoch.
                perplexity = np.exp(float(loss))
                logging.info("global step %d step-time %.2f ppl %.2f" %
                             (model.global_step.eval(), step_time, perplexity))
                step_time, loss = 0.0, 0.0

                # Run evals on development set and print their perplexity.
                for bucket_id in range(len(_buckets)):
                    if len(dev_set[bucket_id]) == 0:
                        logging.info(" eval: empty bucket %d" % (bucket_id))
                        continue
                    encoder_inputs, decoder_inputs, encoder_len, decoder_len = \
                        model.get_batch(dev_set, bucket_id)
                    eval_loss, _ = model.step(sess, encoder_inputs,
                                              decoder_inputs, encoder_len,
                                              decoder_len, True)
                    eval_loss = eval_loss * FLAGS.batch_size \
                        / np.sum(decoder_len)
                    eval_ppx = np.exp(float(eval_loss))
                    logging.info(" eval: bucket %d ppl %.2f" %
                                 (bucket_id, eval_ppx))
                sys.stdout.flush()
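
# ---------------------------------------------------------------------------
# Sketch only: create_bucket is not defined in this excerpt.  The sampling in
# train() above assumes each (doc, sum) id pair has been placed into the
# smallest entry of _buckets that fits both sequences, which is the usual
# TF seq2seq bucketing scheme.  The helper below is an illustrative guess,
# not the project's actual implementation; all names here are hypothetical.
# ---------------------------------------------------------------------------
def _bucket_pairs_sketch(docids, sumids, buckets):
    """Assign each (doc, sum) pair to the smallest bucket that can hold it."""
    data = [[] for _ in buckets]
    for doc, summ in zip(docids, sumids):
        for b, (s_size, t_size) in enumerate(buckets):
            if len(doc) <= s_size and len(summ) <= t_size:
                data[b].append((doc, summ))
                break  # pairs longer than the largest bucket are dropped
    return data
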
sum_file = "data/modified_train_abstract.txt"
vocab_file = "data/vocab"
checkpoint_dir = "./save/quasi/checkpoints"
checkpoint_prefix = os.path.join(checkpoint_dir, "baseline")
dev_doc_file = "data/val_article.txt"
dev_sum_file = "data/val_abstract.txt"

# load source and target data
docs, sums, vocab = load_data(doc_file, sum_file, vocab_file, max_vocab_size,
                              debug=debug, max_num_tokens=max_num_tokens)
dev_docs, dev_sums = load_valid_data(dev_doc_file, dev_sum_file, vocab,
                                     max_num_tokens, debug=debug)
vocab_size = vocab.size()

# self, vocab_size, embedding_size, state_size, num_layers,
# decoder_vocab_size, attention_hidden_size, mode, beam_depth,
# learning_rate, max_iter=100, attention_mode="Bahdanau"):


def load_glove(glove_file, vocab, embedding_size):
    print("load pretrained glove from : {}".format(glove_file))
    with open(glove_file, "r", encoding="utf-8") as f:
        lines = f.readlines()
    # Rows default to small random values; size from the passed-in vocab
    # rather than the module-level vocab_size so the helper works for any
    # vocabulary it is given.
    embedding = np.random.uniform(-0.25, 0.25, (vocab.size(), embedding_size))
    for line in lines:
        tokens = line.strip().split()
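
# ---------------------------------------------------------------------------
# Sketch only: the load_glove body above is cut off right after splitting each
# GloVe line.  The usual remainder copies each pretrained vector into its row
# of the embedding matrix, as sketched below.  word_to_id is a hypothetical
# stand-in for whatever word -> id lookup the vocab object exposes (only
# vocab.size() appears in this excerpt); numpy is assumed imported as np, as
# in the code above.
# ---------------------------------------------------------------------------
def _fill_glove_rows_sketch(lines, word_to_id, embedding, embedding_size):
    """Copy each pretrained GloVe vector into its row of `embedding`."""
    for line in lines:
        tokens = line.strip().split()
        word, vec = tokens[0], tokens[1:]
        idx = word_to_id(word)  # hypothetical lookup; returns None if OOV
        if idx is not None and len(vec) == embedding_size:
            embedding[idx] = np.asarray(vec, dtype=np.float32)
    return embedding
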
def train():
    logging.info("Preparing summarization data.")
    docid, sumid, doc_dict, sum_dict, hidden_label = \
        data_util.load_data(
            FLAGS.data_dir + "/train.48615.diff",
            FLAGS.data_dir + "/train.48615.msg",
            FLAGS.data_dir + "/doc_dict.txt",
            FLAGS.data_dir + "/sum_dict.txt",
            FLAGS.doc_vocab_size, FLAGS.sum_vocab_size)
    val_docid, val_sumid, val_hidd_label = \
        data_util.load_valid_data(
            FLAGS.data_dir + "/valid.3000.diff",
            FLAGS.data_dir + "/valid.3000.msg",
            doc_dict, sum_dict)

    with tf.Session() as sess:
        # Create model.
        logging.info("Creating %d layers of %d units." %
                     (FLAGS.num_layers, FLAGS.size))
        train_writer = tf.summary.FileWriter(FLAGS.tfboard, sess.graph)
        model = create_model(sess, False)

        # Read data into buckets and compute their sizes.
        logging.info("Create buckets.")
        dev_set = create_bucket(val_docid, val_sumid, val_hidd_label)
        train_set = create_bucket(docid, sumid, hidden_label)

        train_bucket_sizes = [len(train_set[b]) for b in range(len(_buckets))]
        train_total_size = float(sum(train_bucket_sizes))
        train_buckets_scale = [
            sum(train_bucket_sizes[:i + 1]) / train_total_size
            for i in range(len(train_bucket_sizes))
        ]
        for (s_size, t_size, _), nsample in zip(_buckets, train_bucket_sizes):
            logging.info("Train set bucket ({}, {}) has {} samples.".format(
                s_size, t_size, nsample))

        # This is the training loop.
        step_time, loss = 0.0, 0.0
        current_step = sess.run(model.global_step)

        while current_step < FLAGS.max_iter:
            random_number_01 = np.random.random_sample()
            bucket_id = min([
                i for i in range(len(train_buckets_scale))
                if train_buckets_scale[i] > random_number_01
            ])

            # Get a batch and make a step.
            start_time = time.time()
            encoder_inputs, decoder_inputs, encoder_len, decoder_len, \
                class_output, class_len = \
                data_util.get_batch(train_set, _buckets, bucket_id,
                                    FLAGS.batch_size, False, 0)
            step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                      encoder_len, decoder_len, False,
                                      train_writer)
            step_time += (time.time() - start_time) / \
                FLAGS.steps_per_validation
            loss += step_loss * FLAGS.batch_size / np.sum(decoder_len) \
                / FLAGS.steps_per_validation
            current_step += 1

            # Once in a while, we save checkpoint.
            if current_step % FLAGS.steps_per_checkpoint == 0:
                # Save checkpoint and zero timer and loss.
                checkpoint_path = os.path.join(FLAGS.train_dir, "model.ckpt")
                model.saver.save(sess, checkpoint_path,
                                 global_step=model.global_step)

            # Once in a while, we print statistics and run evals.
            if current_step % FLAGS.steps_per_validation == 0:
                # Print statistics for the previous epoch.
                perplexity = np.exp(float(loss))
                logging.info("global step %d step-time %.2f ppl %.2f" %
                             (model.global_step.eval(), step_time, perplexity))
                step_time, loss = 0.0, 0.0

                # Run evals on development set and print their perplexity.
                for bucket_id in range(len(_buckets)):
                    if len(dev_set[bucket_id]) == 0:
                        logging.info(" eval: empty bucket %d" % (bucket_id))
                        continue
                    encoder_inputs, decoder_inputs, encoder_len, decoder_len, \
                        class_output, class_len = \
                        data_util.get_batch(dev_set, _buckets, bucket_id,
                                            FLAGS.batch_size, False, 0)
                    #cl_eval_loss, _ = class_model.step(sess, class_input, class_output, class_len, True)
                    eval_loss, _, _ = model.step(sess, encoder_inputs,
                                                 decoder_inputs, encoder_len,
                                                 decoder_len, True)
                    eval_loss = eval_loss * FLAGS.batch_size \
                        / np.sum(decoder_len)
                    eval_ppx = np.exp(float(eval_loss))
                    logging.info(" eval: bucket %d ppl %.2f" %
                                 (bucket_id, eval_ppx))
                sys.stdout.flush()

        # Get Encoder outputs
        batchidx = 0
        final_inputs = []
        final_outputs = []
        final_len = []
        while batchidx + FLAGS.batch_size <= train_bucket_sizes[0]:
            encoder_inputs, decoder_inputs, encoder_len, decoder_len, \
                class_output, class_len = \
                data_util.get_batch(train_set, _buckets, bucket_id,
                                    FLAGS.batch_size, True, batchidx)
            _, _, enc_outputs = model.step(sess, encoder_inputs,
                                           decoder_inputs, encoder_len,
                                           decoder_len, True)
            enc_outputs = data_util.add_pad_for_hidden(enc_outputs,
                                                       _buckets[0][0])
            final_inputs.append(enc_outputs)
            final_outputs.append(class_output)
            final_len.append(class_len)
            batchidx += FLAGS.batch_size

        final_inputs = np.asarray(final_inputs)
        final_inputs = np.concatenate(final_inputs, 0)
        final_outputs = np.asarray(final_outputs)
        final_outputs = np.concatenate(final_outputs, 0)
        final_len = np.asarray(final_len)
        final_len = np.concatenate(final_len, 0)
        print(final_inputs.shape, final_outputs.shape, final_len.shape)

        # Hidden classifier
        class_model = create_class_model(sess, False)
        classification_curr_step = sess.run(class_model.global_step)
        i = 0
        while classification_curr_step <= FLAGS.class_max_iter:
            _, step_loss, output = class_model.step(sess,
                                                    final_inputs[i:(i + 160)],
                                                    final_outputs[i:(i + 160)],
                                                    final_len[i:(i + 160)],
                                                    False)
            classification_curr_step += 1
            clipped = np.array(output > 0.5, dtype=np.int)
            #print("i", i)
            #print("clfcurrstep", classification_curr_step)
            #print("clipped", clipped.flatten())
            #print("final_outputs", final_outputs[i:(i+160)].flatten())
            tn, fp, fn, tp = confusion_matrix(
                final_outputs[i:(i + 160)].flatten(),
                clipped.flatten()).ravel()
            if (classification_curr_step % 40 == 0):
                print("Train Precision", tp / (tp + fp + 0.1))
                print("Train Accuracy", (tp + tn) / (tp + fp + tn + fn))
            if (i + 160 == len(final_len)):
                i = 0
            else:
                i += 160

            # Once in a while, we save checkpoint.
            if classification_curr_step % FLAGS.steps_per_checkpoint == 0:
                # Save checkpoint and zero timer and loss.
                checkpoint_path = os.path.join(FLAGS.class_train_dir,
                                               "class_model.ckpt")
                class_model.saver.save(sess, checkpoint_path,
                                       global_step=class_model.global_step)

        print("test_file", FLAGS.test_file)
        docs, data = data_util.load_test_data(FLAGS.test_file, doc_dict)

        # test
        # Create model and load parameters.
        '''
        logging.info("Creating %d layers of %d units." %
                     (FLAGS.num_layers, FLAGS.size))
        result = []
        for idx, token_ids in enumerate(data):
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, encoder_len, decoder_len, \
                class_output, class_len = \
                data_util.get_batch(
                    {0: [(token_ids, [data_util.ID_GO, data_util.ID_EOS], [0, 0])]},
                    _buckets, 0, FLAGS.batch_size, False, 0)
            if FLAGS.batch_size == 1 and FLAGS.geneos:
                loss, outputs = model.step(sess, encoder_inputs,
                                           decoder_inputs, encoder_len,
                                           decoder_len, False)
                outputs = [np.argmax(item) for item in outputs[0]]
            else:
                outputs = model.step_beam(
                    sess, encoder_inputs, encoder_len, geneos=FLAGS.geneos)

            # If there is an EOS symbol in outputs, cut them at that point.
            if data_util.ID_EOS in outputs:
                outputs = outputs[:outputs.index(data_util.ID_EOS)]

            gen_sum = " ".join(data_util.sen_map2tok(outputs, sum_dict[1]))
            gen_sum = data_util.sen_postprocess(gen_sum)
            result.append(gen_sum)
            logging.info("Finish {} samples. :: {}".format(idx, gen_sum[:75]))
        '''

        # Get Encoder outputs
        docid, sumid, doc_dict, sum_dict, hidden_label = \
            data_util.load_data(
                FLAGS.data_dir + "/test.1981.diff.txt",
                FLAGS.data_dir + "/test.1981.msg.txt",
                FLAGS.data_dir + "/doc_dict.txt",
                FLAGS.data_dir + "/sum_dict.txt",
                FLAGS.doc_vocab_size, FLAGS.sum_vocab_size)

        test_set = create_bucket(docid, sumid, hidden_label)
        test_bucket_sizes = [len(test_set[b]) for b in range(len(_buckets))]
        test_total_size = float(sum(test_bucket_sizes))
        test_buckets_scale = [
            sum(test_bucket_sizes[:i + 1]) / test_total_size
            for i in range(len(test_bucket_sizes))
        ]

        batchidx = 0
        final_inputs = []
        final_outputs = []
        final_len = []
        # data.shape == (1, 158, 3) so I changed FLAGS.batch_size
        FLAGS.batch_size = 158
        while batchidx + FLAGS.batch_size <= len(data):
            #bucket_id = (i for i in range(len(test_buckets_scale))
            encoder_inputs, decoder_inputs, encoder_len, decoder_len, \
                class_output, class_len = \
                data_util.get_batch(test_set, _buckets, bucket_id,
                                    FLAGS.batch_size, True, batchidx)
            _, _, enc_outputs = model.step(sess, encoder_inputs,
                                           decoder_inputs, encoder_len,
                                           decoder_len, True)
            enc_outputs = data_util.add_pad_for_hidden(enc_outputs,
                                                       _buckets[0][0])
            final_inputs.append(enc_outputs)
            final_outputs.append(class_output)
            final_len.append(class_len)
            batchidx += 1

        final_inputs = np.asarray(final_inputs)
        final_inputs = np.concatenate(final_inputs, 0)
        final_outputs = np.asarray(final_outputs)
        final_outputs = np.concatenate(final_outputs, 0)
        final_len = np.asarray(final_len)
        final_len = np.concatenate(final_len, 0)
        print(final_inputs.shape, final_outputs.shape, final_len.shape)

        # Hidden classifier
        step_loss, output = class_model.step(sess, final_inputs[:],
                                             final_outputs[:], final_len[:],
                                             True)
        clipped = np.array(output > 0.5, dtype=np.int)
        tn, fp, fn, tp = confusion_matrix(final_outputs[:].flatten(),
                                          clipped.flatten()).ravel()

        #with open('data/test.1981.msg.txt') as reader:
        #    testmsg = []
        #    for i in range(1981):
        #        testmsg.append(reader.readline())
        #sums = list(map(lambda x: x.split(), testmsg))
        #labels = data_util.hidden_label_gen(FLAGS.test_file, sums)
        #tn, fp, fn, tp = confusion_matrix(labels.flatten(), clipped.flatten())

        print("Test Precision : ", tp / (tp + fp + 0.1))
        print("Test Accuracy", (tp + tn) / (tp + fp + tn + fn))

        with open(FLAGS.test_output, "w") as f:
            for idx in range(1981):
                for j in range(len(docs[idx])):
                    if clipped[idx][j] == 1:
                        print("Recommended identifier: " + docs[idx][j] + " ",
                              file=f)
                print("\n", file=f)
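
# ---------------------------------------------------------------------------
# Sketch only: data_util.add_pad_for_hidden is not defined in this excerpt.
# The classifier batches above appear to assume it zero-pads the per-token
# encoder outputs of a batch out to the bucket's source length
# (_buckets[0][0]) so every batch has a fixed-size feature tensor.  The helper
# below is an illustrative guess assuming [batch, seq_len, hidden] inputs, not
# the project's actual implementation; numpy is assumed imported as np.
# ---------------------------------------------------------------------------
def _pad_hidden_sketch(enc_outputs, max_len):
    """Zero-pad [batch, seq_len, hidden] outputs to [batch, max_len, hidden]."""
    enc_outputs = np.asarray(enc_outputs)
    batch, seq_len, hidden = enc_outputs.shape
    padded = np.zeros((batch, max_len, hidden), dtype=enc_outputs.dtype)
    copy_len = min(seq_len, max_len)
    padded[:, :copy_len, :] = enc_outputs[:, :copy_len, :]
    return padded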