def evaluate(model, data, ty='valid', max_dec_step=50):
    model.__id__logger = 0
    dial = []
    ref, hyp_g = [], []
    if ty == "test":
        print("testing generation:")
        # t = Translator(model, model.vocab)
    l = []
    p = []
    kld = []
    bow = []
    elbo = []
    pbar = tqdm(enumerate(data), total=len(data))
    for j, batch in pbar:
        # teacher-forced forward pass: no parameter updates, just loss tracking
        loss, ppl, kld_prog, bow_prog, elbo_prog = model.train_one_batch(batch, 0, train=False)
        l.append(loss)
        p.append(ppl)
        kld.append(kld_prog)
        bow.append(bow_prog)
        elbo.append(elbo_prog)
        if ty == "test" or (ty == "valid" and j < 3):
            sent_g = model.decoder_greedy(batch, max_dec_step=max_dec_step)
            for i, greedy_sent in enumerate(sent_g):
                rf = " ".join(batch["target_txt"][i])
                hyp_g.append(greedy_sent)
                ref.append(rf)
                print_custum(
                    emotion=batch["program_txt"][i],
                    dial=[" ".join(s) for s in batch['input_txt'][i]]
                    if config.dataset == "empathetic" else " ".join(batch['input_txt'][i]),
                    ref=rf,
                    hyp_g=greedy_sent)
        else:
            continue  # skip the progress-bar update for batches we do not decode
        pbar.set_description("loss:{:.4f} ppl:{:.1f}".format(np.mean(l), math.exp(np.mean(l))))
    loss = np.mean(l)
    ppl = np.mean(p)
    kld = np.mean(kld)
    bow = np.mean(bow)
    elbo = np.mean(elbo)
    bleu_score_g = moses_multi_bleu(np.array(hyp_g), np.array(ref), lowercase=True)
    # bleu_score_b = moses_multi_bleu(np.array(hyp_b), np.array(ref), lowercase=True)
    gd1, gd2, gd3 = distinct_k(hyp_g)
    rd1, rd2, rd3 = distinct_k(ref)
    print("rd1:{},rd2:{},rd3:{}".format(rd1, rd2, rd3))
    print("EVAL\tLoss\tPPL\tBleu_g\td1\td2\td3")
    print("{}\t{:.4f}\t{:.4f}\t{:.2f}\t{:.4f}\t{:.4f}\t{:.4f}".format(
        ty, loss, math.exp(loss), bleu_score_g, gd1, gd2, gd3))
    return loss, math.exp(loss), kld, bow, elbo, bleu_score_g, gd1, gd2, gd3
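# The distinct-1/2/3 numbers above come from a `distinct_k` helper that is not
# shown in this file. A minimal sketch of what such a helper typically computes
# (the ratio of unique n-grams to total n-grams across the whole hypothesis
# list), assuming each hypothesis is a whitespace-tokenized string. This is an
# illustration of the metric, not the repo's actual implementation:
def distinct_k_sketch(sentences):
    """Return (distinct-1, distinct-2, distinct-3) over a list of sentences."""
    scores = []
    for n in (1, 2, 3):
        total, uniq = 0, set()
        for sent in sentences:
            tokens = sent.split()
            ngrams = [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
            total += len(ngrams)
            uniq.update(ngrams)
        scores.append(len(uniq) / total if total > 0 else 0.0)
    return tuple(scores)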
def evaluate(model, data, ty='valid', max_dec_step=30):
    model.__id__logger = 0
    dial = []
    ref, hyp_g, hyp_b, hyp_t = [], [], [], []
    if ty == "test":
        print("testing generation:")
    t = Translator(model, model.vocab)
    l = []
    p = []
    bce = []
    acc = []
    pbar = tqdm(enumerate(data), total=len(data))
    for j, batch in pbar:
        loss, ppl, bce_prog, acc_prog = model.train_one_batch(batch, 0, train=False)
        l.append(loss)
        p.append(ppl)
        bce.append(bce_prog)
        acc.append(acc_prog)
        if ty == "test":
            # decode with both strategies so BLEU can be reported for each
            sent_g = model.decoder_greedy(batch, max_dec_step=max_dec_step)
            sent_b = t.beam_search(batch, max_dec_step=max_dec_step)
            # sent_t = model.decoder_topk(batch, max_dec_step=max_dec_step)
            for i, (greedy_sent, beam_sent) in enumerate(zip(sent_g, sent_b)):
                rf = " ".join(batch["target_txt"][i])
                hyp_g.append(greedy_sent)
                hyp_b.append(beam_sent)
                # hyp_t.append(topk_sent)
                ref.append(rf)
                print_custum(
                    emotion=batch["program_txt"][i],
                    dial=[" ".join(s) for s in batch['input_txt'][i]]
                    if config.dataset == "empathetic" else " ".join(batch['input_txt'][i]),
                    ref=rf,
                    # hyp_t=topk_sent,
                    hyp_g=greedy_sent,
                    hyp_b=beam_sent)
        pbar.set_description("loss:{:.4f} ppl:{:.1f}".format(np.mean(l), math.exp(np.mean(l))))
    loss = np.mean(l)
    ppl = np.mean(p)
    bce = np.mean(bce)
    acc = np.mean(acc)
    bleu_score_g = moses_multi_bleu(np.array(hyp_g), np.array(ref), lowercase=True)
    bleu_score_b = moses_multi_bleu(np.array(hyp_b), np.array(ref), lowercase=True)
    # bleu_score_t = moses_multi_bleu(np.array(hyp_t), np.array(ref), lowercase=True)
    print("EVAL\tLoss\tPPL\tAccuracy\tBleu_g\tBleu_b")
    print("{}\t{:.4f}\t{:.4f}\t{:.2f}\t{:.2f}\t{:.2f}".format(
        ty, loss, math.exp(loss), acc, bleu_score_g, bleu_score_b))
    return loss, math.exp(loss), bce, acc, bleu_score_g, bleu_score_b
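# All of these evaluators report perplexity as exp(mean token-level NLL),
# i.e. math.exp(np.mean(l)). A tiny self-contained check of that identity,
# using made-up per-batch loss values (illustrative only):
import math
import numpy as np

batch_losses = [2.31, 2.05, 2.40]  # hypothetical per-batch NLLs
mean_loss = np.mean(batch_losses)
print("loss: {:.4f}  ppl: {:.1f}".format(mean_loss, math.exp(mean_loss)))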
def evaluate_transformer(model, data, model_name='trs', ty='valid', writer=None,
                         n_iter=0, ty_eval="before", verbose=False, log=False,
                         result_file="results/results_transformer.txt",
                         ref_file="results/ref_transformer.txt",
                         case_file="results/case_transformer.txt"):
    if log:
        f1 = open(result_file, "a")
        f2 = open(ref_file, "a")
    dial, ref, hyp_b, per = [], [], [], []
    t = TrsTranslator(model, model.vocab)
    l = []
    p = []
    ent_b = []
    pbar = tqdm(enumerate(data), total=len(data))
    for j, batch in pbar:
        torch.cuda.empty_cache()
        loss, ppl, _ = model.train_one_batch(batch, train=False)
        l.append(loss)
        p.append(ppl)
        if ty == "test" or j < 3:  # decode every test batch, only the first 3 otherwise
            sent_b, _ = t.translate_batch(batch)
            for i in range(len(batch["target_txt"])):
                # convert ids to words: stop at EOS and drop a token that
                # immediately repeats its predecessor
                new_words = []
                for w in sent_b[i][0]:
                    if w == config.EOS_idx:
                        break
                    new_words.append(w)
                    if len(new_words) > 2 and (new_words[-2] == w):
                        new_words.pop()
                sent_beam_search = ' '.join(
                    [model.vocab.index2word[idx] for idx in new_words])
                hyp_b.append(sent_beam_search)
                if log:
                    f1.write(sent_beam_search)
                    f1.write("\n")
                ref.append(batch["target_txt"][i])
                if log:
                    f2.write(batch["target_txt"][i])
                    f2.write("\n")
                dial.append(batch['input_txt'][i])
                per.append(batch['persona_txt'][i])
                ent_b.append(0.0)
                # ent_b.append(bert.predict_label([sent_beam_search for _ in range(len(batch['persona_txt'][i]))], batch['persona_txt'][i]))
        pbar.set_description("loss:{:.4f} ppl:{:.1f}".format(np.mean(l), np.mean(p)))
        torch.cuda.empty_cache()
        if j > 4 and ty == "train":
            break  # during training, only sample a handful of batches
    loss = np.mean(l)
    ppl = np.mean(p)
    ent_b = np.mean(ent_b)
    bleu_score_b = moses_multi_bleu(np.array(hyp_b), np.array(ref), lowercase=True)
    # bleu_score_b = get_bleu(np.array(hyp_b), np.array(ref))
    if log:
        f1.close()
        f2.close()
    if verbose:
        print("-" * 70)
        print("-" * 70)
        print_all(dial, ref, hyp_b, max_print=3 if ty != "test" else 100)
        print("EVAL\tLoss\tPerplexity\tEntl_b\tBleu_b")
        print("{}\t{:.4f}\t{:.4f}\t{:.2f}\t{:.2f}".format(
            ty, loss, ppl, ent_b, bleu_score_b))
    if log:
        log_all(dial, ref, hyp_b, per, case_file)
    return loss, ppl, ent_b, bleu_score_b
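# The id-to-text cleanup inside evaluate_transformer (truncate at EOS, drop a
# token that immediately repeats its predecessor) can be pulled out into a
# helper. A minimal sketch, assuming `eos_idx` and an `index2word` mapping are
# supplied by the caller; both names are hypothetical, not defined here:
def ids_to_clean_sentence(ids, index2word, eos_idx):
    """Truncate at EOS, collapse immediate token repetitions, join with spaces."""
    words = []
    for idx in ids:
        if idx == eos_idx:
            break
        words.append(idx)
        if len(words) > 2 and words[-2] == idx:
            words.pop()
    return " ".join(index2word[i] for i in words)

# usage (illustrative):
# ids_to_clean_sentence([5, 7, 7, 9, 1], {5: "i", 7: "am", 9: "fine"}, 1)
# -> "i am fine"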
def evaluate(model, data, ty='valid', max_dec_step=30, write_summary=False):
    emo_map = {
        'surprised': 0, 'excited': 1, 'annoyed': 2, 'proud': 3, 'angry': 4,
        'sad': 5, 'grateful': 6, 'lonely': 7, 'impressed': 8, 'afraid': 9,
        'disgusted': 10, 'confident': 11, 'terrified': 12, 'hopeful': 13,
        'anxious': 14, 'disappointed': 15, 'joyful': 16, 'prepared': 17,
        'guilty': 18, 'furious': 19, 'nostalgic': 20, 'jealous': 21,
        'anticipating': 22, 'embarrassed': 23, 'content': 24, 'devastated': 25,
        'sentimental': 26, 'caring': 27, 'trusting': 28, 'ashamed': 29,
        'apprehensive': 30, 'faithful': 31}
    # invert so predicted emotion ids can be mapped back to their names
    emo_map = {v: k for k, v in emo_map.items()}
    model.__id__logger = 0
    dial = []
    ref, hyp_g, hyp_b, hyp_t = [], [], [], []
    if ty == "test":
        print("testing generation:")
    t = Translator(model, model.vocab)
    l = []
    p = []
    bce = []
    acc = []
    pbar = tqdm(enumerate(data), total=len(data))
    inf_results = []
    try:
        for j, batch in pbar:
            loss, ppl, bce_prog, acc_prog = model.train_one_batch(batch, 0, train=False)
            l.append(loss)
            p.append(ppl)
            bce.append(bce_prog)
            acc.append(acc_prog)
            if ty == "test":
                sent_g, vader_score, emotion_id = model.decoder_greedy(batch, max_dec_step=max_dec_step)
                sent_b = t.beam_search(batch, max_dec_step=max_dec_step)
                sent_t = model.decoder_topk(batch, max_dec_step=max_dec_step)
                for i, (greedy_sent, beam_sent, topk_sent) in enumerate(zip(sent_g, sent_b, sent_t)):
                    rf = " ".join(batch["target_txt"][i])
                    hyp_g.append(greedy_sent)
                    hyp_b.append(beam_sent)
                    hyp_t.append(topk_sent)
                    ref.append(rf)
                    temp = write_custum(
                        emotion=batch["program_txt"][i],
                        vader_score=vader_score,
                        emo=emo_map[emotion_id],
                        dial=[" ".join(s) for s in batch['input_txt'][i]]
                        if config.dataset == "empathetic" else " ".join(batch['input_txt'][i]),
                        ref=rf,
                        hyp_t=topk_sent,
                        hyp_g=greedy_sent,
                        hyp_b=beam_sent)
                    inf_results.append(temp)
            pbar.set_description("loss:{:.4f} ppl:{:.1f}".format(np.mean(l), math.exp(np.mean(l))))
    except KeyboardInterrupt:
        print("Interrupted: only a fraction of the test set was evaluated, do not use this result!")
    loss = np.mean(l)
    ppl = np.mean(p)
    bce = np.mean(bce)
    acc = np.mean(acc)
    bleu_score_g = moses_multi_bleu(np.array(hyp_g), np.array(ref), lowercase=True)
    bleu_score_b = moses_multi_bleu(np.array(hyp_b), np.array(ref), lowercase=True)
    bleu_score_t = moses_multi_bleu(np.array(hyp_t), np.array(ref), lowercase=True)
    print("EVAL\tLoss\tPPL\tAccuracy\tBleu_g\tBleu_b\tBleu_t")
    print("{}\t{:.4f}\t{:.4f}\t{:.2f}\t{:.2f}\t{:.2f}\t{:.2f}".format(
        ty, loss, math.exp(loss), acc, bleu_score_g, bleu_score_b, bleu_score_t))
    if write_summary:
        return loss, math.exp(loss), bce, acc, bleu_score_g, bleu_score_b, bleu_score_t, inf_results
    return loss, math.exp(loss), bce, acc, bleu_score_g, bleu_score_b, bleu_score_t
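# Hedged usage sketch for write_summary=True: `inf_results` holds one formatted
# record per decoded test example (whatever string write_custum returns). The
# helper below and its default path are hypothetical -- this module does not
# define them -- but show one way a caller might persist the records:
def save_inference_results(inf_results, path="results/inference_results.txt"):
    """Write each inference record on its own line (assumes string records)."""
    with open(path, "w") as f:
        f.write("\n".join(inf_results))

# usage (illustrative):
# *_, inf_results = evaluate(model, test_loader, ty="test", write_summary=True)
# save_inference_results(inf_results)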
def evaluate(model, data, ty='valid', max_dec_step=30, save=False):
    emotion_lst, batch_lst, ref, hyp_g, hyp_b = [], [], [], [], []
    if ty == "test":
        print("testing generation:")
    t = Translator(model, model.vocab)
    l = []
    p = []
    bce = []
    acc = []
    pbar = tqdm(enumerate(data), total=len(data))
    for j, batch in pbar:
        loss, ppl, bce_prog, acc_prog = model.train_one_batch(batch, train=False)
        l.append(loss)
        p.append(ppl)
        bce.append(bce_prog)
        acc.append(acc_prog)
        if ty == "test":
            sent_g = model.decoder_greedy(batch, max_dec_step=max_dec_step)
            sent_b = t.beam_search(batch, max_dec_step=max_dec_step)
            for i, (greedy_sent, beam_sent) in enumerate(zip(sent_g, sent_b)):
                emotion_lst.append(batch["program_txt"][i])
                batch_lst.append(
                    [" ".join(s) for s in batch['input_txt'][i]]
                    if config.dataset == "empathetic_dialogues"
                    else " ".join(batch['input_txt'][i]))
                rf = " ".join(
                    [ele for lis in batch["target_txt"][i] for ele in lis]
                    if config.dataset == "empathetic_dialogues"
                    else batch['target_txt'][i])
                hyp_g.append(greedy_sent)
                hyp_b.append(beam_sent)
                ref.append(rf)
                print_custum(
                    emotion=batch["program_txt"][i],
                    dial=[" ".join(s) for s in batch['input_txt'][i]]
                    if config.dataset == "empathetic_dialogues"
                    else " ".join(batch['input_txt'][i]),
                    ref=rf,
                    hyp_g=greedy_sent,
                    hyp_b=beam_sent)
        pbar.set_description("loss:{:.4f} ppl:{:.1f}".format(np.mean(l), math.exp(np.mean(l))))
    if ty == "test" and save:
        # dump contexts, references, and both decodings as headerless CSVs
        if not os.path.exists(config.save_path + 'test/'):
            os.mkdir(config.save_path + 'test/')
        pd.DataFrame(emotion_lst).to_csv(config.save_path + 'test/emotions.csv', index=False, header=False)
        pd.DataFrame(batch_lst).to_csv(config.save_path + 'test/batch.csv', index=False, header=False)
        pd.DataFrame(hyp_g).to_csv(config.save_path + 'test/reply_greedy.csv', index=False, header=False)
        pd.DataFrame(hyp_b).to_csv(config.save_path + 'test/reply_beam.csv', index=False, header=False)
        pd.DataFrame(ref).to_csv(config.save_path + 'test/reply_true.csv', index=False, header=False)
    loss = np.mean(l)
    bce = np.mean(bce)
    acc = np.mean(acc)
    bleu_score_g = moses_multi_bleu(np.array(hyp_g), np.array(ref), lowercase=True)
    bleu_score_b = moses_multi_bleu(np.array(hyp_b), np.array(ref), lowercase=True)
    rouge_score_g = rouge(np.array(hyp_g), np.array(ref))
    rouge_score_b = rouge(np.array(hyp_b), np.array(ref))
    print("EVAL\tLoss\tPPL\tAccuracy\tBleu_g\tBleu_b\tROUGE_g\tROUGE_b")
    print("{}\t{:.4f}\t{:.4f}\t{:.2f}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}".format(
        ty, loss, math.exp(loss), acc, bleu_score_g, bleu_score_b, rouge_score_g, rouge_score_b))
    return loss, math.exp(loss), bce, acc, bleu_score_g, bleu_score_b, rouge_score_g, rouge_score_b
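# Hedged companion sketch: reading back the CSVs written by evaluate(save=True)
# for side-by-side inspection. `save_dir` would mirror config.save_path + 'test/';
# the column keys are assigned here for readability only, since the files are
# written with header=False above. Illustrative helper, not part of the repo:
import pandas as pd

def load_test_outputs(save_dir):
    """Return one DataFrame aligning emotion, context, decodings, and reference."""
    frames = {
        "emotion": pd.read_csv(save_dir + "emotions.csv", header=None),
        "context": pd.read_csv(save_dir + "batch.csv", header=None),
        "greedy": pd.read_csv(save_dir + "reply_greedy.csv", header=None),
        "beam": pd.read_csv(save_dir + "reply_beam.csv", header=None),
        "reference": pd.read_csv(save_dir + "reply_true.csv", header=None),
    }
    return pd.concat(frames.values(), axis=1, keys=frames.keys())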