def calculate_scores():
    hyp_fn, ref_fn = 'tmp.%s.src' % mode, 'tmp.%s.tgt' % mode
    write_token_id_arrays_to_text_file(
        hypotheses, os.path.join(model_dir, hyp_fn), tokenizer)
    write_token_id_arrays_to_text_file(
        references, os.path.join(model_dir, ref_fn), tokenizer)

    hyp_fn, ref_fn = os.path.join(model_dir, hyp_fn), os.path.join(
        model_dir, ref_fn)

    files_rouge = FilesRouge(hyp_fn, ref_fn)
    rouge_scores = files_rouge.get_scores(avg=True)

    bleu_score = bleu_wrapper(ref_fn, hyp_fn, case_sensitive=True)

    return rouge_scores, bleu_score
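# Usage sketch for `calculate_scores` (illustrative only; it assumes the
# enclosing evaluation code has already defined `mode`, `model_dir`,
# `tokenizer`, `hypotheses`, and `references`, as in the surrounding example).
# `FilesRouge.get_scores(avg=True)` typically returns a dict keyed by metric
# name ('rouge-1', 'rouge-2', 'rouge-l') with 'f'/'p'/'r' sub-scores; the
# exact layout depends on the installed `rouge` package version.
#
#     rouge_scores, bleu_score = calculate_scores()
#     logger.info('ROUGE-L (F1): %.4f, BLEU: %.4f',
#                 rouge_scores['rouge-l']['f'], bleu_score)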
def _eval_epoch(sess, epoch, mode):
    if mode == 'eval':
        eval_data = dev_data
    elif mode == 'test':
        eval_data = test_data
    else:
        raise ValueError('`mode` should be either "eval" or "test".')

    references, hypotheses = [], []
    bsize = config_data.test_batch_size
    for i in range(0, len(eval_data), bsize):
        #print("eval {}/{}".format(i, len(eval_data)))
        sources, targets = zip(*eval_data[i:i + bsize])
        x_block = data_utils.source_pad_concat_convert(sources)
        feed_dict = {
            encoder_input: x_block,
            tx.global_mode(): tf.estimator.ModeKeys.EVAL,
        }
        fetches = {
            'inferred_ids': inferred_ids,
        }
        fetches_ = sess.run(fetches, feed_dict=feed_dict)
        hypotheses.extend(h.tolist() for h in fetches_['inferred_ids'])
        references.extend(r.tolist() for r in targets)

    hypotheses = utils.list_strip_eos(hypotheses, eos_token_id)
    references = utils.list_strip_eos(references, eos_token_id)

    if mode == 'eval':
        # Writes results to files to evaluate BLEU
        # For 'eval' mode, the BLEU is based on token ids (rather than
        # text tokens) and serves only as a surrogate metric to monitor
        # the training process
        fname = os.path.join(FLAGS.model_dir, 'tmp.eval')
        hypotheses = tx.utils.str_join(hypotheses)
        references = tx.utils.str_join(references)
        hyp_fn, ref_fn = tx.utils.write_paired_text(
            hypotheses, references, fname, mode='s')
        eval_bleu = bleu_wrapper(ref_fn, hyp_fn, case_sensitive=True)
        eval_bleu = 100. * eval_bleu
        logger.info('epoch: %d, eval_bleu %.4f', epoch, eval_bleu)
        print('epoch: %d, eval_bleu %.4f' % (epoch, eval_bleu))

        if eval_bleu > best_results['score']:
            logger.info('epoch: %d, best bleu: %.4f', epoch, eval_bleu)
            best_results['score'] = eval_bleu
            best_results['epoch'] = epoch
            model_path = os.path.join(FLAGS.model_dir, 'best-model.ckpt')
            logger.info('saving model to %s', model_path)
            print('saving model to %s' % model_path)
            saver.save(sess, model_path)

    elif mode == 'test':
        # For 'test' mode, together with the cmds in README.md, BLEU
        # is evaluated based on text tokens, which is the standard metric.
        fname = os.path.join(FLAGS.model_dir, 'test.output')
        hwords, rwords = [], []
        for hyp, ref in zip(hypotheses, references):
            hwords.append([id2w[y] for y in hyp])
            rwords.append([id2w[y] for y in ref])
        hwords = tx.utils.str_join(hwords)
        rwords = tx.utils.str_join(rwords)
        hyp_fn, ref_fn = tx.utils.write_paired_text(
            hwords, rwords, fname, mode='s')
        logger.info('Test output written to file: %s', hyp_fn)
        print('Test output written to file: %s' % hyp_fn)
def _eval_epoch(epoch, mode):
    torch.cuda.empty_cache()
    if mode == 'eval':
        eval_data = dev_data
    elif mode == 'test':
        eval_data = test_data
    else:
        raise ValueError("`mode` should be either \"eval\" or \"test\".")

    references, hypotheses = [], []
    bsize = config_data.test_batch_size
    for i in tqdm(range(0, len(eval_data), bsize)):
        sources, targets = zip(*eval_data[i:i + bsize])
        with torch.no_grad():
            x_block = data_utils.source_pad_concat_convert(
                sources, device=device)
            predictions = model(
                encoder_input=x_block,
                is_train_mode=False,
                beam_width=beam_width)
            if beam_width == 1:
                decoded_ids = predictions[0].sample_id
            else:
                decoded_ids = predictions["sample_id"][:, :, 0]

            hypotheses.extend(h.tolist() for h in decoded_ids)
            references.extend(r.tolist() for r in targets)

    hypotheses = utils.list_strip_eos(hypotheses, eos_token_id)
    references = utils.list_strip_eos(references, eos_token_id)

    if mode == 'eval':
        # Writes results to files to evaluate BLEU
        # For 'eval' mode, the BLEU is based on token ids (rather than
        # text tokens) and serves only as a surrogate metric to monitor
        # the training process
        # TODO: Use texar.evals.bleu
        fname = os.path.join(args.model_dir, 'tmp.eval')
        hwords, rwords = [], []
        for hyp, ref in zip(hypotheses, references):
            hwords.append([str(y) for y in hyp])
            rwords.append([str(y) for y in ref])
        hwords = tx.utils.str_join(hwords)
        rwords = tx.utils.str_join(rwords)
        hyp_fn, ref_fn = tx.utils.write_paired_text(
            hwords, rwords, fname, mode='s',
            src_fname_suffix='hyp', tgt_fname_suffix='ref')
        eval_bleu = bleu_wrapper(ref_fn, hyp_fn, case_sensitive=True)
        eval_bleu = 100. * eval_bleu
        logger.info("epoch: %d, eval_bleu %.4f", epoch, eval_bleu)
        print(f"epoch: {epoch:d}, eval_bleu {eval_bleu:.4f}")

        if eval_bleu > best_results['score']:
            logger.info("epoch: %d, best bleu: %.4f", epoch, eval_bleu)
            best_results['score'] = eval_bleu
            best_results['epoch'] = epoch
            model_path = os.path.join(args.model_dir, args.model_fn)
            logger.info("Saving model to %s", model_path)
            print(f"Saving model to {model_path}")
            states = {
                'model': model.state_dict(),
                'optimizer': optim.state_dict(),
                'scheduler': scheduler.state_dict(),
            }
            torch.save(states, model_path)

    elif mode == 'test':
        # For 'test' mode, together with the cmds in README.md, BLEU
        # is evaluated based on text tokens, which is the standard metric.
        fname = os.path.join(args.model_dir, 'test.output')
        hwords, rwords = [], []
        for hyp, ref in zip(hypotheses, references):
            hwords.append([id2w[y] for y in hyp])
            rwords.append([id2w[y] for y in ref])
        hwords = tx.utils.str_join(hwords)
        rwords = tx.utils.str_join(rwords)
        hyp_fn, ref_fn = tx.utils.write_paired_text(
            hwords, rwords, fname, mode='s',
            src_fname_suffix='hyp', tgt_fname_suffix='ref')
        logger.info("Test output written to file: %s", hyp_fn)
        print(f"Test output written to file: {hyp_fn}")
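# For reference, `utils.list_strip_eos` is assumed to behave roughly like the
# sketch below: each hypothesis/reference is truncated at its first EOS id so
# that padding and everything after EOS is excluded from BLEU scoring.
# Illustrative only; the example's own helper is the authoritative version.
def _strip_eos_sketch(sequences, eos_id):
    stripped = []
    for seq in sequences:
        # Keep tokens strictly before the first EOS; leave the sequence
        # untouched if no EOS is present.
        stripped.append(seq[:seq.index(eos_id)] if eos_id in seq else seq)
    return stripped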
def _test_epoch(cur_sess, cur_epoch, gamma_, lambda_g_, mode='test'):
    def _id2word_map(id_arrays):
        return [
            ' '.join(
                [train_data.vocab._id_to_token_map_py[i] for i in sent])
            for sent in id_arrays
        ]

    templates_list, targets_list, hypothesis_list = [], [], []
    cnt = 0
    loss_lists, ppl_lists = [], []
    while True:
        try:
            fetches = {
                'data_batch': data_batch,
                'predictions': predictions,
                'template': template_pack,
                'step': global_step,
                'loss': cetp_loss
            }
            feed = {
                iterator.handle: iterator.get_handle(sess, mode),
                gamma: gamma_,
                lambda_g: lambda_g_,
                tx.context.global_mode(): tf.estimator.ModeKeys.EVAL
            }
            rtns = cur_sess.run(fetches, feed_dict=feed)
            real_templates_, templates_, targets_, predictions_ = \
                rtns['template']['templates'], rtns['template']['text_ids'], \
                rtns['data_batch']['text_ids'], rtns['predictions']
            loss = rtns['loss']
            ppl = np.exp(loss)
            loss_lists.append(loss)
            ppl_lists.append(ppl)

            filled_templates = \
                tx_utils.fill_template(template_pack=rtns['template'],
                                       predictions=rtns['predictions'],
                                       eoa_id=eoa_id, pad_id=pad_id,
                                       eos_id=eos_id)

            templates, targets, generateds = \
                _id2word_map(real_templates_.tolist()), \
                _id2word_map(targets_), \
                _id2word_map(filled_templates)

            for template, target, generated in zip(templates, targets,
                                                   generateds):
                template = template.split('<EOS>')[0].split(
                    '<PAD>')[0].strip().split()
                target = target.split('<EOS>')[0].split(
                    '<PAD>')[0].strip().split()
                got = generated.split('<EOS>')[0].split(
                    '<PAD>')[0].strip().split()
                templates_list.append(template)
                targets_list.append(target)
                hypothesis_list.append(got)

            cnt += 1
            if mode != 'test' and cnt >= 60:
                break
        except tf.errors.OutOfRangeError:
            break

    avg_loss, avg_ppl = np.mean(loss_lists), np.mean(ppl_lists)
    outputs_tmp_filename = args.log_dir + \
        'epoch{}.beam{}.outputs.tmp'.format(cur_epoch, args.beam_width)
    template_tmp_filename = args.log_dir + \
        'epoch{}.beam{}.templates.tmp'.format(cur_epoch, args.beam_width)
    refer_tmp_filename = os.path.join(args.log_dir, 'eval_reference.tmp')
    with codecs.open(outputs_tmp_filename, 'w+', 'utf-8') as tmpfile, \
            codecs.open(template_tmp_filename, 'w+', 'utf-8') as tmptpltfile, \
            codecs.open(refer_tmp_filename, 'w+', 'utf-8') as tmpreffile:
        for hyp, tplt, tgt in zip(hypothesis_list, templates_list,
                                  targets_list):
            tmpfile.write(' '.join(hyp) + '\n')
            tmptpltfile.write(' '.join(tplt) + '\n')
            tmpreffile.write(' '.join(tgt) + '\n')
    eval_bleu = float(100 * bleu_tool.bleu_wrapper(
        refer_tmp_filename, outputs_tmp_filename, case_sensitive=True))
    template_bleu = float(100 * bleu_tool.bleu_wrapper(
        refer_tmp_filename, template_tmp_filename, case_sensitive=True))
    print('epoch:{} {}_bleu:{} template_bleu:{} {}_loss:{} {}_ppl:{}'.format(
        cur_epoch, mode, eval_bleu, template_bleu, mode, avg_loss, mode,
        avg_ppl))
    os.remove(outputs_tmp_filename)
    os.remove(template_tmp_filename)
    os.remove(refer_tmp_filename)

    if args.save_eval_output:
        result_filename = \
            args.log_dir + 'epoch{}.beam{}.{}.results.bleu{:.3f}'.format(
                cur_epoch, args.beam_width, mode, eval_bleu)
        with codecs.open(result_filename, 'w+', 'utf-8') as resultfile:
            for tmplt, tgt, hyp in zip(templates_list, targets_list,
                                       hypothesis_list):
                resultfile.write("- template: " + ' '.join(tmplt) + '\n')
                resultfile.write("- expected: " + ' '.join(tgt) + '\n')
                resultfile.write('- got: ' + ' '.join(hyp) + '\n\n')
    return {'eval': eval_bleu, 'template': template_bleu}, avg_ppl
def _eval_epoch(epoch, mode, print_fn=None):
    if print_fn is None:
        print_fn = print
        tqdm_leave = True
    else:
        tqdm_leave = False

    model.eval()
    eval_data = datasets[mode]
    eval_iter = tx.data.DataIterator(eval_data)
    references, hypotheses = [], []
    for batch in tqdm.tqdm(eval_iter, ncols=120, leave=tqdm_leave,
                           desc=f"Eval on {mode} set"):
        predictions = model(
            encoder_input=batch.source,
            beam_width=beam_width,
        )
        if beam_width == 1:
            decoded_ids = predictions[0].sample_id
        else:
            decoded_ids = predictions["sample_id"][:, :, 0]

        hypotheses.extend(h.tolist() for h in decoded_ids)
        references.extend(r.tolist() for r in batch.target_output)

    hypotheses = utils.list_strip_eos(hypotheses, vocab.eos_token_id)
    references = utils.list_strip_eos(references, vocab.eos_token_id)

    if mode == "valid":
        # Writes results to files to evaluate BLEU
        # For 'eval' mode, the BLEU is based on token ids (rather than
        # text tokens) and serves only as a surrogate metric to monitor
        # the training process
        # TODO: Use texar.evals.bleu
        fname = os.path.join(args.model_dir, "tmp.eval")
        hwords, rwords = [], []
        for hyp, ref in zip(hypotheses, references):
            hwords.append([str(y) for y in hyp])
            rwords.append([str(y) for y in ref])
        hwords = tx.utils.str_join(hwords)
        rwords = tx.utils.str_join(rwords)
        hyp_fn, ref_fn = tx.utils.write_paired_text(
            hwords, rwords, fname, mode="s",
            src_fname_suffix="hyp", tgt_fname_suffix="ref",
        )
        eval_bleu = bleu_wrapper(ref_fn, hyp_fn, case_sensitive=True)
        eval_bleu = 100.0 * eval_bleu
        logger.info("epoch: %d, eval_bleu %.4f", epoch, eval_bleu)
        print_fn(f"epoch: {epoch:d}, eval_bleu {eval_bleu:.4f}")

        if eval_bleu > best_results["score"]:
            logger.info("epoch: %d, best bleu: %.4f", epoch, eval_bleu)
            best_results["score"] = eval_bleu
            best_results["epoch"] = epoch
            model_path = os.path.join(args.model_dir, args.model_fn)
            logger.info("Saving model to %s", model_path)
            print_fn(f"Saving model to {model_path}")
            states = {
                "model": model.state_dict(),
                "optimizer": optim.state_dict(),
                "scheduler": scheduler.state_dict(),
            }
            torch.save(states, model_path)

    elif mode == "test":
        # For 'test' mode, together with the cmds in README.md, BLEU
        # is evaluated based on text tokens, which is the standard metric.
        fname = os.path.join(args.model_dir, "test.output")
        hwords, rwords = [], []
        for hyp, ref in zip(hypotheses, references):
            hwords.append(vocab.map_ids_to_tokens_py(hyp))
            rwords.append(vocab.map_ids_to_tokens_py(ref))
        hwords = tx.utils.str_join(hwords)
        rwords = tx.utils.str_join(rwords)
        hyp_fn, ref_fn = tx.utils.write_paired_text(
            hwords, rwords, fname, mode="s",
            src_fname_suffix="hyp", tgt_fname_suffix="ref",
        )
        logger.info("Test output written to file: %s", hyp_fn)
        print_fn(f"Test output written to file: {hyp_fn}")