def gen_barrage_wechcat(raw_input, model):
    """Generate a single barrage string for one raw input.

    The input is segmented with ``pro_sentpiece`` (using ``hp.bpe_model``),
    wrapped in a one-element list and fed through ``input_fn`` with a batch
    size of 1. The graph output of ``model.eval_gen`` is then decoded by
    ``get_hypotheses`` after a checkpoint has been restored.

    Args:
        raw_input: Raw input handed to ``pro_sentpiece``.
        model: Object exposing ``eval_gen(xs, ys)`` and ``idx2token``.

    Returns:
        The hypotheses returned by ``get_hypotheses`` joined with no separator.
    """
    segmented = pro_sentpiece(raw_input, hp.bpe_model)
    single_input = [segmented]

    # The same one-element list is passed as the first two arguments.
    test_batches = input_fn(single_input, single_input, hp.barrages_vocab, 1,
                            shuffle=False)
    batch_iter = tf.data.Iterator.from_structure(test_batches.output_types,
                                                 test_batches.output_shapes)
    xs, ys = batch_iter.get_next()
    test_init_op = batch_iter.make_initializer(test_batches)

    # Only the first of the three values returned by eval_gen is used.
    y_hat, _, _ = model.eval_gen(xs, ys)

    with tf.Session() as sess:
        latest = tf.train.latest_checkpoint(hp.ckpt)
        # None: hp.ckpt is a file. Otherwise it is a directory.
        ckpt = latest if latest is not None else hp.ckpt
        saver = tf.train.Saver()
        saver.restore(sess, ckpt)
        sess.run(test_init_op)

        logging.info("# get hypotheses")
        hypotheses, _ = get_hypotheses(1, 1, sess, y_hat, model.idx2token)

        logging.info("# write results")
        logging.info(hypotheses)
        logging.info("# Done")
        return "".join(hypotheses)
def test(hp):
    """Decode the test set with a restored checkpoint and score the output.

    Hyper-parameters are loaded from ``hp.ckpt``, the model output is turned
    into text by ``get_hypotheses``, written to a file inside ``hp.testdir``
    (named after the checkpoint), and passed to ``calc_bleu_nltk`` together
    with ``hp.test2``.

    Args:
        hp: Hyper-parameter object; the attributes read here are ``ckpt``,
            ``test1``, ``test2``, ``vocab``, ``test_batch_size`` and
            ``testdir``.
    """
    # Loading hyper params
    load_hparams(hp, hp.ckpt)

    logging.info("# Prepare test batches")
    # hp.test1 is passed as both of the first two arguments.
    test_batches, num_test_batches, num_test_samples = get_batch(
        hp.test1, hp.test1, 100000, 100000,
        hp.vocab, hp.test_batch_size, shuffle=False)
    batch_iter = tf.data.Iterator.from_structure(test_batches.output_types,
                                                 test_batches.output_shapes)
    xs, ys = batch_iter.get_next()
    test_init_op = batch_iter.make_initializer(test_batches)

    logging.info("# Load model")
    model = Transformer(hp)

    logging.info("# Session")
    with tf.Session() as sess:
        # Fall back to hp.ckpt itself when no latest checkpoint is reported.
        ckpt = tf.train.latest_checkpoint(hp.ckpt) or hp.ckpt
        saver = tf.train.Saver()
        saver.restore(sess, ckpt)

        y_hat, mean_loss = model.eval(sess, test_init_op, xs, ys,
                                      num_test_batches)

        logging.info("# get hypotheses")
        hypotheses = get_hypotheses(num_test_samples, y_hat, model.idx2token)

        logging.info("# write results")
        model_output = os.path.basename(ckpt)
        if not os.path.exists(hp.testdir):
            os.makedirs(hp.testdir)
        translation = os.path.join(hp.testdir, model_output)
        with open(translation, 'w', encoding="utf-8") as fout:
            fout.write("\n".join(hypotheses))

        logging.info("# calc bleu score and append it to translation")
        calc_bleu_nltk(hp.test2, translation)
# Training loop: one optimisation step per iteration. Whenever the global
# step is a non-zero multiple of train_num_batches, the model is evaluated,
# the hypotheses are written to hp.evaldir and a checkpoint is saved.
sess.run(train_init_op)
for i in tqdm(range(_gs, total_steps+1)):
    _, _gs, _summary = sess.run([train_op, global_step, train_summaries])
    epoch = _gs // train_num_batches
    summary_writer.add_summary(_summary, _gs)

    if _gs and _gs % train_num_batches == 0:
        logging.info("# Epoch {} is done".format(epoch))
        _loss = sess.run(loss)

        logging.info("# Test evaluation")
        _, _eval_summary = sess.run([eval_init_op, eval_summaries])
        # Evaluation summaries are indexed by epoch, not by global step.
        summary_writer.add_summary(_eval_summary, epoch)

        logging.info("# Get hypotheses")
        hypotheses = get_hypotheses(num_eval_batches, num_eval_samples, sess, y_hat, m.idx2token)

        logging.info("# Write results")
        model_output = "iwslt2016_E%02dL%.2f" % (epoch, _loss)
        # Was misspelled `os.path.exsits`, which raised AttributeError.
        if not os.path.exists(hp.evaldir):
            os.makedirs(hp.evaldir)
        translation = os.path.join(hp.evaldir, model_output)
        with open(translation, "w") as f:
            f.write("\n".join(hypotheses))

        logging.info("# Calc bleu score and append it to translation")
        calc_bleu(hp.eval3, translation)

        logging.info("# Save model")
        # Was misspelled `os.ptah.join`, which raised AttributeError.
        ckpt_name = os.path.join(hp.logdir, model_output)
        saver.save(sess, ckpt_name, global_step=_gs)
        logging.info("After training of {} epochs, {} has been saved.".format(epoch, ckpt_name))
        # NOTE(review): nothing visible here re-runs train_init_op after
        # eval_init_op has been run - confirm the input pipeline is switched
        # back to the training data beyond this excerpt.
def calculate_average(dict_accumulator, number_of_iterations): for key in dict_accumulator: dict_accumulator[key] = dict_accumulator[key] / number_of_iterations return dict_accumulator #enddef if __name__ == "__main__": final_dict_accumulator = {} number_of_iterations = 10 exp_iter = 0 #input all hypotheses hypotheses = utils.get_hypotheses(hypotheses_file) #get all examples examples = utils.input_examples(ds) # initial distribution Q_0 = np.ones(len(hypotheses)) / len(hypotheses) for iteration in range(0, number_of_iterations): #Define deltas deltas_for_noise_feature = np.arange(0, 0.201, 0.02) teacher_type = "noise_feature" #"noise_feature" #limited_ground_truth accumulator = {} for delta in deltas_for_noise_feature: delta = np.round(delta, 2) teachers_examples = utils.get_teachers_examples(
# Build the model, restore the latest checkpoint from hp.modeldir, decode the
# test set and start collecting the cleaned-up reference sentences.
logging.info("# Load model")
m = Transformer(hp)
y_hat, _, refs = m.eval(xs, ys)

logging.info("# Session")
with tf.Session() as sess:
    ckpt = tf.train.latest_checkpoint(hp.modeldir)
    saver = tf.train.Saver()
    saver.restore(sess, ckpt)
    sess.run(test_init_op)

    logging.info("# get hypotheses")
    hypotheses, refs_result = get_hypotheses(
        num_test_batches, num_test_samples, sess, y_hat, refs, m.idx2token)

    # Write the original (reference) results to local disk
    logging.info("write references")
    result_output = "refs"
    if not os.path.exists(hp.test_result):
        os.makedirs(hp.test_result)
    ref_path = os.path.join(hp.test_result, result_output)
    with open(ref_path, 'w', encoding='utf-8') as fout:
        _refs = []
        for r in refs_result:
            # Each item is decoded from UTF-8 bytes and split on whitespace.
            pieces = r.decode('utf-8').split()
            # remove bpe symbols
            sent = ''.join(piece.replace("▁", " ") for piece in pieces)
            _refs.append(sent.strip())
from utils import get_hypotheses
from collections import Counter
import nltk
from nltk.tag import pos_tag, map_tag
from nltk.corpus import wordnet as wn
import numpy, pickle

HYP_FILE = "./data/train-test.hyp1-hyp2-ref"
GOLD_FILE = "./data/train.gold"
OUTPUT_FILE = "./output-exp.pred"
TRADEOFF_PARAM = 0.35  # between 0 and 1

[labeled_instances, unlabeled_instances] = get_hypotheses(HYP_FILE, GOLD_FILE)


def memoize(func):
    """Wrap a one-argument function so each distinct argument is computed once.

    Results are kept in a dictionary keyed by the argument itself.
    """
    cache = {}

    def inner(stuff):
        if stuff in cache:
            return cache[stuff]
        result = func(stuff)
        cache[stuff] = result
        return result

    return inner


# nltk can't pos tag unicode
def preprocess(text):
    """Return ``text`` keeping only characters whose code point is below 128."""
    kept = [character for character in text if ord(character) < 128]
    return ''.join(kept)


wntags = {'NOUN': 'n', 'VERB': 'v', 'ADJ': 'a', 'ADV': 'r'}


@memoize
def get_word_count_dict(sentence):
    wcount_dict = Counter()
# One training step fed through the training handle, followed by a periodic
# evaluation that writes "<target>-<hypothesis>" lines into hp.evaldir.
xs_output, ys_output, handle_output, _, _gs, _summary = sess.run(
    [xs, ys, handle, train_op, global_step, train_summaries],
    feed_dict={handle: training_handle})
summary_writer.add_summary(_summary, _gs)

# Evaluate on every non-zero multiple of hp.gpu_nums * 5000 steps.
if _gs % (hp.gpu_nums * 5000) == 0 and _gs != 0:
    logging.info("steps {} is done".format(_gs))

    logging.info("# test evaluation")
    sess.run(val_iter.initializer)  # initial val dataset
    _eval_summaries = sess.run(eval_summaries, feed_dict={handle: val_handle})
    summary_writer.add_summary(_eval_summaries, _gs)

    logging.info("# beam search")
    hypotheses, all_targets = get_hypotheses(
        num_eval_batches, num_eval_samples, sess, m, bs,
        [xs[0], ys[2]], handle, val_handle)

    logging.info("# calc rouge score ")
    if not os.path.exists(hp.evaldir):
        os.makedirs(hp.evaldir)
    rouge_l = calc_rouge(rouge, all_targets, hypotheses, _gs, hp.evaldir)
    # The output file name encodes the global step and the value from calc_rouge.
    model_output = "trans_pointer%02dL%.2f" % (_gs, rouge_l)

    logging.info('# write hypotheses')
    with open(os.path.join(hp.evaldir, model_output), 'w', encoding='utf-8') as f:
        f.writelines(
            '{}-{} \n'.format(target, ' '.join(hypothes))
            for target, hypothes in zip(all_targets, hypotheses))
# Restore hp.ckpt, decode the test set, time the decoding and write the
# hypotheses to hp.testdir under the checkpoint's base name.
logging.info("# Session")
saver = tf.train.Saver()
with tf.Session() as sess:
    # hp.ckpt is used directly; the latest-checkpoint lookup is disabled:
    #ckpt_ = tf.train.latest_checkpoint(hp.ckpt)
    #ckpt = hp.ckpt if ckpt_ is None else ckpt_ # None: ckpt is a file. otherwise dir.
    ckpt = hp.ckpt
    #saver = tf.train.import_meta_graph('/home/shuangzhao/transformer_1/mt_log/g213_2/NMT_E6_L2.444_lr7e-05_-73488.meta')
    saver.restore(sess, ckpt)
    sess.run(test_init_op)

    logging.info("# get hypotheses")
    t1 = time.time()
    hypotheses = get_hypotheses(
        num_test_batches, num_test_samples, sess, y_hat, m.en_idx2token)
    t2 = time.time()
    print("-" * 80)
    print(f"Time for getting results: {t2 - t1}")

    logging.info("# write results")
    # Text after the last "/" of the checkpoint path.
    model_output = ckpt.rsplit("/", 1)[-1]
    if not os.path.exists(hp.testdir):
        os.makedirs(hp.testdir)
    translation = os.path.join(hp.testdir, model_output)
    with open(translation, 'w') as fout:
        fout.write("\n".join(hypotheses))

    # The message below is still logged although the BLEU call is disabled.
    logging.info("# calc bleu score and append it to translation")
    #calc_bleu(hp.test2, translation)
    print(f"Time rest: {time.time() - t2}")
def train_(self, epochs):
    """Train ``self.model`` for ``epochs`` epochs on the IWSLT 2016 BPE files.

    Training resumes from the latest checkpoint in ``self.model_dir`` when
    one exists; otherwise variables are initialised from scratch. Every time
    the global step reaches a non-zero multiple of the number of training
    batches, the method runs the evaluation summaries, writes the decoded
    hypotheses to ``../data/eval/1``, calls ``calc_bleu`` on that file,
    saves a checkpoint and re-initialises the training iterator.

    Args:
        epochs: Number of epochs; the loop covers global steps up to
            ``epochs * num_train_batches``.
    """
    eval_dir = '../data/eval/1'

    train_batches, num_train_batches, num_train_samples = get_batch(
        '../data/iwslt2016/segmented/train.de.bpe',
        '../data/iwslt2016/segmented/train.en.bpe',
        self.sequence_length, self.sequence_length,
        self.vocab_file, self.batch_size, shuffle=True)
    eval_batches, num_eval_batches, num_eval_samples = get_batch(
        '../data/iwslt2016/segmented/eval.de.bpe',
        '../data/iwslt2016/segmented/eval.en.bpe',
        100000, 100000,
        self.vocab_file, self.batch_size, shuffle=False)

    # One iterator serves both datasets; the init ops select which one feeds it.
    batch_iter = tf.data.Iterator.from_structure(train_batches.output_types,
                                                 train_batches.output_shapes)
    xs, ys = batch_iter.get_next()
    train_init_op = batch_iter.make_initializer(train_batches)
    eval_init_op = batch_iter.make_initializer(eval_batches)

    loss, train_op, global_step, train_summaries = self.model.train(xs, ys)
    y_hat, eval_summaries = self.model.eval(xs, ys)

    logging.info("# Session")
    with tf.Session() as sess:
        ckpt = tf.train.latest_checkpoint(self.model_dir)
        if ckpt is None:
            logging.info("Initializing from scratch")
            sess.run(tf.global_variables_initializer())
            save_variable_specs(os.path.join('../data/log/1', "specs"))
        else:
            self.saver.restore(sess, ckpt)

        summary_writer = tf.summary.FileWriter(self.model_dir, sess.graph)

        sess.run(train_init_op)
        total_steps = epochs * num_train_batches
        _gs = sess.run(global_step)
        for i in tqdm(range(_gs, total_steps + 1)):
            _, _gs, _summary = sess.run(
                [train_op, global_step, train_summaries])
            epoch = math.ceil(_gs / num_train_batches)
            summary_writer.add_summary(_summary, _gs)

            if _gs and _gs % num_train_batches == 0:
                logging.info("epoch {} is done".format(epoch))
                _loss = sess.run(loss)  # train loss

                logging.info("# test evaluation")
                _, _eval_summaries = sess.run(
                    [eval_init_op, eval_summaries])
                summary_writer.add_summary(_eval_summaries, _gs)

                logging.info("# get hypotheses")
                hypotheses = get_hypotheses(num_eval_batches,
                                            num_eval_samples, sess, y_hat,
                                            self.model.index_char)

                logging.info("# write results")
                model_output = "iwslt2016_E%02dL%.2f" % (epoch, _loss)
                # Check the same path that is created. The old check looked
                # at 'data/eval/1', so makedirs could be called on an
                # already existing '../data/eval/1' and raise.
                if not os.path.exists(eval_dir):
                    os.makedirs(eval_dir)
                translation = os.path.join(eval_dir, model_output)
                with open(translation, 'w') as fout:
                    fout.write("\n".join(hypotheses))

                logging.info(
                    "# calc bleu score and append it to translation")
                calc_bleu('../data/iwslt2016/prepro/eval.en', translation)

                logging.info("# save models")
                self.saver.save(sess,
                                os.path.join(self.model_dir,
                                             'transformer.dat'),
                                global_step=_gs)

                # Switch the shared iterator back to the training data.
                sess.run(train_init_op)
    summary_writer.close()
    logging.info("Done")
def train(hp):
    """Train a ``Transformer`` with per-epoch evaluation and early stopping.

    Hyper-parameters are saved to ``hp.checkpoints_dir``, from which the
    latest checkpoint (if any) is restored. Each time the global step is a
    non-zero multiple of the number of training batches, the model is
    evaluated, the hypotheses are written to ``hp.evaldir`` and passed to
    ``calc_bleu_nltk``, a checkpoint is saved, and the values returned by
    ``calculate_earlystop_baseline`` are logged. Training stops early once
    the first of those values exceeds 20.0.

    Args:
        hp: Hyper-parameter object; the attributes read here include
            ``checkpoints_dir``, ``train1``/``train2``, ``eval1``/``eval2``,
            ``maxlen1``/``maxlen2``, ``vocab``, ``batch_size``,
            ``num_epochs`` and ``evaldir``.
    """
    save_hparams(hp, hp.checkpoints_dir)

    # Data generator
    logging.info("Prepare Train/Eval batches...")
    train_batches, num_train_batches, num_train_samples = get_batch(
        hp.train1, hp.train2, hp.maxlen1, hp.maxlen2,
        hp.vocab, hp.batch_size, shuffle=True)
    eval_batches, num_eval_batches, num_eval_samples = get_batch(
        hp.eval1, hp.eval2, 10000, 10000,
        hp.vocab, hp.batch_size, shuffle=False)

    # Batch iterator shared by the train and eval datasets
    batch_iter = tf.data.Iterator.from_structure(train_batches.output_types,
                                                 train_batches.output_shapes)
    xs, ys = batch_iter.get_next()
    train_init_op = batch_iter.make_initializer(train_batches)
    eval_init_op = batch_iter.make_initializer(eval_batches)

    # Build model
    logging.info("Build model...")
    model = Transformer(hp)
    logging.info("Model is built!")

    # Session
    logging.info("Session initialize")
    saver = tf.train.Saver(max_to_keep=5)
    with tf.Session() as sess:
        # Check & load the latest model checkpoint
        ckpt = tf.train.latest_checkpoint(hp.checkpoints_dir)
        if ckpt is None:
            logging.info("Initializing from scratch")
            sess.run(tf.global_variables_initializer())
            save_variable_specs(os.path.join(hp.checkpoints_dir, "specs"))
        else:
            saver.restore(sess, ckpt)

        summary_writer = tf.summary.FileWriter(hp.checkpoints_dir, sess.graph)

        sess.run(train_init_op)
        total_steps = hp.num_epochs * num_train_batches
        _gs = sess.run(model.global_step)

        # Early-stopping state
        k = 5
        min_dev_loss = 0
        stop_alpha = 20.0
        eval_losses = []

        # Start training
        for _ in tqdm(range(_gs, total_steps + 1)):
            _input_x, _decoder_input, _target = sess.run(
                [xs[0], ys[0], ys[1]])
            batch_feed = {
                model.input_x: _input_x,
                model.decoder_input: _decoder_input,
                model.target: _target,
            }

            train_feed = dict(batch_feed)
            train_feed[model.is_training] = True
            _, _gs, _summary = sess.run(
                [model.train_op, model.global_step, model.summaries],
                feed_dict=train_feed)
            epoch = math.ceil(_gs / num_train_batches)
            summary_writer.add_summary(_summary, _gs)

            # Everything below runs only at an epoch boundary.
            if not _gs or _gs % num_train_batches != 0:
                continue

            logging.info("Epoch {} is done".format(epoch))
            # Loss on the batch just trained on, with is_training off.
            loss_feed = dict(batch_feed)
            loss_feed[model.is_training] = False
            _loss = sess.run(model.loss, feed_dict=loss_feed)

            # evaluation
            y_hat, mean_loss = model.eval(sess, eval_init_op, xs, ys,
                                          num_eval_batches)

            # id to token
            logging.info("# Get hypotheses")
            hypotheses = get_hypotheses(num_eval_samples, y_hat,
                                        model.idx2token)

            # save translation results
            if not os.path.exists(hp.evaldir):
                os.makedirs(hp.evaldir)
            logging.info("# Write results")
            model_output = "translation_E{:02d}L{:.2f}EL{:.2f}".format(
                epoch, _loss, mean_loss)
            translation = os.path.join(hp.evaldir, model_output)
            with open(translation, 'w', encoding="utf-8") as fout:
                fout.write("\n".join(hypotheses))

            logging.info(
                "# Calculate bleu score and append it to translation")
            # bleu
            calc_bleu_nltk(hp.eval2, translation)

            # save model
            logging.info("# Save models")
            ckpt_name = os.path.join(hp.checkpoints_dir, model_output)
            saver.save(sess, ckpt_name, global_step=_gs)
            logging.info(
                "After training of {} epochs, {} has been saved.".format(
                    epoch, ckpt_name))

            # calculate early stop
            if not eval_losses:
                # The first evaluation seeds the best loss seen so far.
                min_dev_loss = mean_loss
            eval_losses.append(mean_loss)
            gl, p_k, pq_alpha = calculate_earlystop_baseline(
                mean_loss, min_dev_loss, eval_losses, k)
            min_dev_loss = min(min_dev_loss, mean_loss)
            # Keep only the k most recent evaluation losses.
            eval_losses = eval_losses[-k:]
            logging.info(
                "GL(t): {:.4f}, P_k: {:.4f}, PQ_alpha: {:.4f}".format(
                    gl, p_k, pq_alpha))
            if gl > stop_alpha:
                logging.info(
                    "No optimization for a long time, auto-stopping...")
                break

            # change data iterator back to train iterator
            sess.run(train_init_op)
    summary_writer.close()
    logging.info("Done")