def eval_meteor_file(run_file, ref_file, tokenizer=None, detokenizer=None):
    """Score a run file against a reference file with corpus-averaged METEOR.

    Both files are 4-column TSVs (``split('\t', 3)`` with an assert on the
    column count).  Run lines are keyed by ``col1##<>##col2``; reference
    lines are grouped by ``col1`` so one query id may carry several
    reference texts.

    Args:
        run_file: path to the system-output TSV (UTF-8).
        ref_file: path to the reference TSV (UTF-8).
        tokenizer: optional callable applied to each reference text.
        detokenizer: optional callable applied to the tokenizer's output.
            NOTE(review): when ``tokenizer`` is given but ``detokenizer`` is
            None this raises TypeError, same as the original — confirm both
            are always passed together.

    Returns:
        dict with a single ``'METEOR'`` key holding the rounded score
        (mean per-query METEOR * 100).
    """
    run_dict = {}
    with codecs.open(run_file, encoding='utf-8') as f:
        for line in f:
            temp = line.strip('\n').strip('\r').split('\t', 3)
            assert len(temp) == 4
            run_dict[temp[1] + '##<>##' + temp[2]] = temp[3]

    ref_dict = {}
    with codecs.open(ref_file, encoding='utf-8') as f:
        for line in f:
            temp = line.strip('\n').strip('\r').split('\t', 3)
            assert len(temp) == 4
            tokenized = temp[3]
            if tokenizer is not None:
                tokenized = detokenizer(tokenizer(temp[3]))
            # Group all references sharing the same query id (column 1).
            ref_dict.setdefault(temp[1], []).append(tokenized)

    meteor = 0.
    # `key` instead of the original `id`, which shadowed the builtin.
    for key in run_dict:  # [text1(,text2)] vs text
        meteor += meteor_score(ref_dict[key.split('##<>##')[0]], run_dict[key])
    return {'METEOR': rounder(meteor * 100 / len(run_dict))}
def batch_meteor(comments, predicts, nl_i2w):
    """Return a per-sample list of METEOR scores for a decoded batch.

    ``batch_evaluate`` maps the raw batches through the id-to-word table;
    an empty hypothesis scores 0 without calling the metric.
    """
    references, hypothesises = batch_evaluate(comments, predicts, nl_i2w)
    scores = []
    for idx, reference in enumerate(references):
        hypothesis = hypothesises[idx]
        if len(hypothesis) == 0:
            scores.append(0)
            continue
        # meteor_score returns a 4-tuple here; only the final score is kept.
        _, _, _, sample_score = meteor_score(' '.join(hypothesis),
                                             ' '.join(reference))
        scores.append(sample_score)
    return scores
def evaluate_on_file(path):
    """Evaluate every prediction dump under *path* with BLEU / ROUGE-L / METEOR.

    Each non-hidden, non-``.txt`` file is expected to be a JSON list of
    records with ``predict``, ``true`` and ``node_len`` fields.  Per-file
    BLEU curves (bucketed by node length and comment length) are written to
    TensorBoard under ``test_log/<name>/``; aggregate means are printed.
    """
    for file in os.listdir(path):
        if file.startswith('.D') or file.endswith('.txt'):
            continue
        writer = SummaryWriter(logdir='test_log/' + file.split('.')[0] + '/')
        # BUG FIX: the original `path + file` produced a wrong path whenever
        # `path` lacked a trailing separator; os.path.join handles both cases.
        file_path = os.path.join(path, file)
        data = load_json(file_path)
        node_len_bleu = {i: [] for i in range(21)}
        com_len_bleu = {i: [] for i in range(31)}
        bleu = []
        rouge_all = []
        meteor_all = []
        for d in data:
            # Bucket node length into 5-wide bins, capped at bin 20.
            node_len = min(int(d['node_len']), 100)
            node_len = int(node_len / 5)
            predict = d['predict'].strip().split()
            true = d['true'].split()
            com_len = min(len(true), 30)
            # Degenerate pairs (tiny hypothesis or very short reference)
            # score 0 instead of calling BLEU.
            if len(predict) <= 1 or com_len < 4:
                score = 0
            else:
                score = sentence_bleu(
                    [true], predict,
                    smoothing_function=SmoothingFunction().method4)
            _, _, rouge_s = rouge_l_score(d['predict'], d['true'])
            meteor_s = meteor_score(d['predict'], d['true'])
            node_len_bleu[node_len].append(score)
            com_len_bleu[com_len].append(score)
            bleu.append(score)
            rouge_all.append(rouge_s)
            meteor_all.append(meteor_s)
        if file not in ['transformer_seq.json']:
            for key, value in node_len_bleu.items():
                if not value or key == 0:
                    continue
                writer.add_scalar('node_len_bleu', np.mean(value), key * 5)
            for key, value in com_len_bleu.items():
                if not value:
                    continue
                score = np.mean(value)
                if score == 0:
                    continue
                writer.add_scalar('com_len_bleu', score, key)
        writer.close()
        print(
            file.split('.')[0] + ': bleu:' + round_3(bleu) + ', rouge:' +
            round_3(rouge_all) + ', meteor:' + round_3(meteor_all))
def main(args):
    """Decode the test set with the latest checkpoint, save predictions,
    and report BLEU4, METEOR and ROUGE-L.

    Args:
        args: parsed CLI namespace; the fields read below (methods, sizes,
            pad id, layer count, stop index) configure model and data.
    """
    # Setting
    warnings.simplefilter("ignore", UserWarning)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Args Parser
    hj_method = args.hj_method
    kr_method = args.kr_method
    batch_size = args.batch_size
    beam_size = args.beam_size
    hidden_size = args.hidden_size
    embed_size = args.embed_size
    vocab_size = args.vocab_size
    max_len = args.max_len
    padding_index = args.pad_id
    n_layers = args.n_layers
    stop_ix = args.stop_ix

    # Load saved model & Word2vec — the lexically last checkpoint wins.
    save_path = 'save_{}_{}_{}_maxlen_{}'.format(vocab_size, hj_method,
                                                 kr_method, max_len)
    save_list = sorted(glob.glob(f'./save/{save_path}/*.*'))
    save_pt = save_list[-1]
    print('Will load {} pt file...'.format(save_pt))
    word2vec_hj = Word2Vec.load('./w2v/word2vec_hj_{}_{}.model'.format(
        vocab_size, hj_method))

    # SentencePiece model load
    spm_kr = spm.SentencePieceProcessor()
    spm_kr.Load("./spm/m_korean_{}.model".format(vocab_size))

    # Test data load
    with open('./test_dat.pkl', 'rb') as f:
        test_dat = pickle.load(f)
    test_dataset = CustomDataset(test_dat['test_hanja'],
                                 test_dat['test_korean'])
    test_loader = getDataLoader(test_dataset, pad_index=padding_index,
                                shuffle=False, batch_size=batch_size)

    # Model load
    print('Model loading...')
    encoder = Encoder(vocab_size, embed_size, hidden_size, word2vec_hj,
                      n_layers=n_layers, padding_index=padding_index)
    decoder = Decoder(embed_size, hidden_size, vocab_size,
                      n_layers=n_layers, padding_index=padding_index)
    seq2seq = Seq2Seq(encoder, decoder, beam_size).cuda()
    print(seq2seq)

    print('Testing...')
    start_time = time.time()
    results = test(seq2seq, test_loader, vocab_size, load_pt=save_pt,
                   stop_ix=stop_ix)
    print(time.time() - start_time)
    print('Done!')

    print("Decoding...")
    pred_list = list()
    for result_text in tqdm(results):
        text = torch.Tensor(result_text).squeeze().tolist()
        text = [int(x) for x in text]
        prediction_sentence = spm_kr.decode_ids(text).strip()  # Decode with strip
        pred_list.append(prediction_sentence)
    ref_list = list()
    for ref_text in tqdm(test_dat['test_korean'][:stop_ix]):
        ref_list.append(spm_kr.decode_ids(ref_text).strip())
    print('Done!')

    with open(f'./save/{save_path}/test_result.pkl', 'wb') as f:
        pickle.dump({
            'pred': pred_list,
            'reference': ref_list,
        }, f)
    # BUG FIX: the original message claimed '/test_dat.pkl'; the file
    # actually written is the result pickle below.
    print(f'Save file; ./save/{save_path}/test_result.pkl')

    # Calculate BLEU Score
    # BUG FIX: the original indexed test_dat['reference']/test_dat['pred'],
    # keys that exist only in the result dict dumped above, not in the
    # loaded test_dat.pkl ('test_hanja'/'test_korean') — guaranteed KeyError.
    # Use the freshly built ref_list/pred_list instead.
    print('Calculate BLEU4, METEOR, Rogue-L...')
    chencherry = SmoothingFunction()
    # NOTE(review): nltk's corpus_bleu expects token lists and one *list of
    # references* per hypothesis; ref_list/pred_list are plain strings here.
    # Confirm against the metric implementation actually imported.
    bleu4 = corpus_bleu(ref_list, pred_list,
                        smoothing_function=chencherry.method4)
    print('BLEU Score is {}'.format(bleu4))

    # Calculate METEOR Score
    meteor = meteor_score(ref_list, pred_list)
    print('METEOR Score is {}'.format(meteor))

    # Calculate Rouge-L Score (macro-averaged over the test set)
    r = Rouge()
    total_test_length = len(ref_list)
    precision_all = 0
    recall_all = 0
    f_score_all = 0
    for i in range(total_test_length):
        [precision, recall, f_score] = r.rouge_l([ref_list[i]],
                                                 [pred_list[i]])
        precision_all += precision
        recall_all += recall
        f_score_all += f_score
    # BUG FIX: the original called the non-existent str method '.foramt'
    # three times, raising AttributeError before any score was printed.
    print('Precision : {}'.format(round(precision_all / total_test_length, 4)))
    print('Recall : {}'.format(round(recall_all / total_test_length, 4)))
    print('F Score : {}'.format(round(f_score_all / total_test_length, 4)))