def betterAnswer(baseline, new, questionBody):
    """Return whichever candidate answer has the higher ROUGE-L F-score.

    The question whose ``body`` equals ``questionBody`` (surrounding
    whitespace ignored) is looked up in the BioASQ training JSON file, and
    both candidates are scored against the first entry of that question's
    ``ideal_answer``. The scores and all three texts are printed.

    Args:
        baseline: Baseline answer text.
        new: Fused answer text.
        questionBody: Body text of the question the answers belong to.

    Returns:
        ``new`` when its F-score is strictly greater than the baseline's,
        ``baseline`` otherwise, or ``None`` when no question body matches.
    """
    # The context manager closes the file; previously the handle leaked.
    with open("./input/BioASQ-trainingDataset5b.json", 'r') as infile_true:
        data_true = json.load(infile_true)

    wanted_body = questionBody.strip()
    for question_i in data_true['questions']:
        if question_i['body'].strip() != wanted_body:
            continue

        r = Rouge()
        manual_summmary = question_i['ideal_answer'][0]
        precision_base, recall_base, f_score_base = r.rouge_l(
            [baseline], [manual_summmary])
        precision_new, recall_new, f_score_new = r.rouge_l(
            [new], [manual_summmary])

        # Single-argument print(...) gives the same output on Python 2 and 3.
        print("============================================")
        print("Ideal_answer \n")
        print(manual_summmary)
        print("Fused_answer %f %f \n" % (precision_new, recall_new))
        print(new)
        print("Baseline_answer %f %f \n" % (precision_base, recall_base))
        print(baseline)
        print("============================================")

        # Only the first matching question is considered; ties go to baseline.
        if f_score_base < f_score_new:
            print("11111")
            return new
        print("22222")
        return baseline

    return None
def read_dataset():
    """Label story sentences by ROUGE-L F-score against every highlight.

    Reads the first 2000 entries returned by ``get_dataset()``. For each
    (sentence, highlight) pair of a story, the sentence is labelled ``'yes'``
    when the F-score is above 0.70 and ``'no'`` otherwise.

    Returns:
        dict mapping the story index to a set of ``(sentence, label)`` tuples.
    """
    scorer = Rouge()
    cutoff = .70
    corpus = get_dataset()

    labelled_by_story = {}
    for story_idx in range(2000):
        entry = corpus[story_idx]
        sentences = entry['story']
        highlight_list = entry['highlights']

        pairs = set()
        for sentence in sentences:
            for highlight in highlight_list:
                _, _, f_measure = scorer.rouge_l(sentence, highlight)
                # NOTE(review): a sentence that clears the cutoff for one
                # highlight but not another ends up with both labels in the
                # set - confirm this is intended.
                label = 'yes' if f_measure > cutoff else 'no'
                pairs.add((sentence, label))
        labelled_by_story[story_idx] = pairs

    return labelled_by_story
from PyRouge.pyrouge import Rouge

# Score one system-generated summary against one manual summary and print
# the three values returned by rouge_l.
r = Rouge()

system_generated_summary = "The Kyrgyz President pushed through the law requiring the use of ink during the upcoming Parliamentary and Presidential elections In an effort to live up to its reputation in the 1990s as an island of democracy. The use of ink is one part of a general effort to show commitment towards more open elections. improper use of this type of ink can cause additional problems as the elections in Afghanistan showed. The use of ink and readers by itself is not a panacea for election ills."

manual_summmary = "The use of invisible ink and ultraviolet readers in the elections of the Kyrgyz Republic which is a small, mountainous state of the former Soviet republic, causing both worries and guarded optimism among different sectors of the population. Though the actual technology behind the ink is not complicated, the presence of ultraviolet light (of the kind used to verify money) causes the ink to glow with a neon yellow light. But, this use of the new technology has caused a lot of problems. "

precision, recall, f_score = r.rouge_l(
    [system_generated_summary],
    [manual_summmary],
)

# One line per metric, joined with newlines.
print("\n".join((
    "Precision is :" + str(precision),
    "Recall is :" + str(recall),
    "F Score is :" + str(f_score),
)))
from PyRouge.pyrouge import Rouge

r = Rouge()

# A simple example of how rouge can be calculated
#print r.rouge_l([[1, 7, 6, 7, 5], [0, 2, 8, 3, 5]], [[1, 2, 3, 4, 5], [3, 9, 5]])

# A more practical example of how it can be used for summary evaluation
system_generated_summary = " The Kyrgyz President pushed through the law requiring the use of ink during the upcoming Parliamentary and Presidential elections In an effort to live up to its reputation in the 1990s as an island of democracy. The use of ink is one part of a general effort to show commitment towards more open elections. improper use of this type of ink can cause additional problems as the elections in Afghanistan showed. The use of ink and readers by itself is not a panacea for election ills."
manual_summmary = " The use of invisible ink and ultraviolet readers in the elections of the Kyrgyz Republic which is a small, mountainous state of the former Soviet republic, causing both worries and guarded optimism among different sectors of the population. Though the actual technology behind the ink is not complicated, the presence of ultraviolet light (of the kind used to verify money) causes the ink to glow with a neon yellow light. But, this use of the new technology has caused a lot of problems. "
#print r.rouge_l([system_generated_summary], [manual_summmary])

ACTUALABSTRACT = '<s> a native american from a tribe not recognized by the feds wins the return of his eagle feathers . </s> <s> an irs accountant is fired for insisting on carrying a symbolic sikh knife to work . </s> <s> a group of chicago pastors takes on city hall over its permits for new churches and loses . </s>'

# The literal used to be broken across a physical line, which leaves a
# single-quoted string unterminated; adjacent literals inside parentheses
# are concatenated instead.
# NOTE(review): the '' near the end closes one literal and opens the next,
# so those two quote characters are not part of the value - confirm whether
# they were meant to be.
GENERATEDABSTRACT = (
    '<go> <s> united states have been growing since the u.s. religious freedom restoration act . </s> <s> the united states have been growing since the u.s. religious freedom restoration act . '
    '</s> <s> new : `` there is reason to doubt whether these state-level religious protections '' </s> <end>'
)

# Generated text first, reference second - the same system-first order as
# the example call above. The reversed order swapped the values reported
# as precision and recall.
[precision, recall, f_score] = r.rouge_l([GENERATEDABSTRACT], [ACTUALABSTRACT])

print("Precision is :"+str(precision)+"\nRecall is :"+str(recall)+"\nF Score is :"+str(f_score))
if((i!=0 and i!=target_word_index['sostok']) and i!=target_word_index['eostok']): newString=newString+reverse_target_word_index[i]+' ' return newString def seq2text(input_seq): newString='' for i in input_seq: if(i!=0): newString=newString+reverse_source_word_index[i]+' ' return newString precision_avg = 0.0 recall_avg = 0.0 fscore_avg = 0.0 for i in range(0,100): print("Review:",seq2text(x_tr[i])) print("Original summary:",seq2summary(y_tr[i])) print("Predicted summary:",decode_sequence(x_tr[i].reshape(1,max_text_len))) precision, recall, f_score] = r.rouge_l([decode_sequence(x_tr[i].reshape(1,max_text_len))],seq2summary(y_tr[i])) precision_avg = precision_avg + precision recall_avg = recall_avg + recall fscore_avg = fscore_avg + f_score print("Precision is :"+str(precision)+"\nRecall is :"+str(recall)+"\nF Score is :"+str(f_score)) print("\n") print("Overall precision is: ", precision_avg/100) print("Overall recall is: ",recall_avg/100) print("Overall fscore is: ",fscore_avg/100)
from PyRouge.pyrouge import Rouge

r = Rouge()

# Read both inputs inside a context manager so the handles are closed once
# their lines are in memory (they used to stay open for the whole run).
with open('test.eval_titles.txt') as fptr1, \
        open('eval_articles.1_300000.txt') as fptr2:
    system_summaries = fptr1.readlines()  #.split()
    model_summaries = fptr2.readlines()  #.split()

# Line i of the first file is scored against line i of the second file.
avg_p = avg_r = avg_f1 = 0
for i, system_line in enumerate(system_summaries):
    model_line = model_summaries[i]
    [precision, recall, f_score] = r.rouge_l([system_line], [model_line])
    avg_p += precision
    avg_r += recall
    avg_f1 += f_score

    print("Sentence:", i)
    # NOTE(review): the "Human:" label is printed for the line taken from
    # system_summaries - confirm the labels match the file contents.
    print("Human:", system_line)
    print("Model:", model_line)
    print("Precision is :" + str(precision) + "\nRecall is :" + str(recall) + "\nF Score is :" + str(f_score))
    print()

# Averages are taken over the number of lines in the first file.
n_sentences = len(system_summaries)
print("----------------------Final eval-------------------")
print("Precision:", float(avg_p / n_sentences))
print("Recall:", float(avg_r / n_sentences))
print("F1-score:", float(avg_f1 / n_sentences))
def main(args):
    """Run a saved seq2seq checkpoint on the test set and print its scores.

    Loads the Word2Vec and SentencePiece models and ``./test_dat.pkl``, runs
    ``test`` with the last checkpoint (by sorted file name) found under
    ``./save/<save_path>/``, decodes predictions and references to text,
    pickles both lists to ``./save/<save_path>/test_result.pkl`` and prints
    BLEU-4, METEOR and the averaged ROUGE-L precision, recall and F-score.

    Args:
        args: Object exposing ``hj_method``, ``kr_method``, ``batch_size``,
            ``beam_size``, ``hidden_size``, ``embed_size``, ``vocab_size``,
            ``max_len``, ``pad_id``, ``n_layers`` and ``stop_ix``.
    """
    # Setting
    warnings.simplefilter("ignore", UserWarning)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Args Parser
    hj_method = args.hj_method
    kr_method = args.kr_method
    batch_size = args.batch_size
    beam_size = args.beam_size
    hidden_size = args.hidden_size
    embed_size = args.embed_size
    vocab_size = args.vocab_size
    max_len = args.max_len
    padding_index = args.pad_id
    n_layers = args.n_layers
    stop_ix = args.stop_ix

    # Load saved model & Word2vec
    save_path = 'save_{}_{}_{}_maxlen_{}'.format(vocab_size, hj_method,
                                                 kr_method, max_len)
    save_list = sorted(glob.glob(f'./save/{save_path}/*.*'))
    save_pt = save_list[-1]
    print('Will load {} pt file...'.format(save_pt))
    word2vec_hj = Word2Vec.load('./w2v/word2vec_hj_{}_{}.model'.format(
        vocab_size, hj_method))

    # SentencePiece model load
    spm_kr = spm.SentencePieceProcessor()
    spm_kr.Load("./spm/m_korean_{}.model".format(vocab_size))

    # Test data load
    # NOTE(review): pickle.load runs arbitrary code from the file; only load
    # files from a trusted source.
    with open('./test_dat.pkl', 'rb') as f:
        test_dat = pickle.load(f)

    test_dataset = CustomDataset(test_dat['test_hanja'],
                                 test_dat['test_korean'])
    test_loader = getDataLoader(test_dataset,
                                pad_index=padding_index,
                                shuffle=False,
                                batch_size=batch_size)

    # Model load
    print('Model loading...')
    encoder = Encoder(vocab_size, embed_size, hidden_size, word2vec_hj,
                      n_layers=n_layers, padding_index=padding_index)
    decoder = Decoder(embed_size, hidden_size, vocab_size,
                      n_layers=n_layers, padding_index=padding_index)
    # Use the device selected above; the unconditional .cuda() ignored the
    # CPU fallback.
    seq2seq = Seq2Seq(encoder, decoder, beam_size).to(device)
    #optimizer = optim.Adam(seq2seq.parameters(), lr=lr, weight_decay=w_decay)
    #scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=scheduler_step_size, gamma=lr_decay)
    print(seq2seq)

    print('Testing...')
    start_time = time.time()
    results = test(seq2seq, test_loader, vocab_size,
                   load_pt=save_pt, stop_ix=stop_ix)
    print(time.time() - start_time)
    print('Done!')

    print("Decoding...")
    pred_list = list()
    for result_text in tqdm(results):
        text = torch.Tensor(result_text).squeeze().tolist()
        text = [int(x) for x in text]
        prediction_sentence = spm_kr.decode_ids(
            text).strip()  # Decode with strip
        pred_list.append(prediction_sentence)

    ref_list = list()
    for ref_text in tqdm(test_dat['test_korean'][:stop_ix]):
        ref_list.append(spm_kr.decode_ids(ref_text).strip())
    print('Done!')

    result_path = f'./save/{save_path}/test_result.pkl'
    with open(result_path, 'wb') as f:
        pickle.dump({
            'pred': pred_list,
            'reference': ref_list,
        }, f)
    # The message used to name /test_dat.pkl, which is the input file.
    print('Save file; {}'.format(result_path))

    # The metrics use the lists built above. 'reference' and 'pred' are keys
    # of the dict just pickled, not of test_dat.

    # Calculate BLEU Score
    print('Calculate BLEU4, METEOR, Rogue-L...')
    chencherry = SmoothingFunction()
    # NOTE(review): both lists hold decoded strings - confirm this is the
    # input shape corpus_bleu expects.
    bleu4 = corpus_bleu(ref_list, pred_list,
                        smoothing_function=chencherry.method4)
    print('BLEU Score is {}'.format(bleu4))

    # Calculate METEOR Score
    # NOTE(review): whole lists are passed in one call - confirm against the
    # meteor_score signature of the installed version.
    meteor = meteor_score(ref_list, pred_list)
    print('METEOR Score is {}'.format(meteor))

    # Calculate Rouge-L Score
    r = Rouge()
    total_test_length = len(ref_list)
    precision_all = 0
    recall_all = 0
    f_score_all = 0
    for i, reference_sentence in enumerate(ref_list):
        # Prediction first, reference second, so precision and recall are
        # measured with respect to the prediction.
        [precision, recall, f_score] = r.rouge_l([pred_list[i]],
                                                 [reference_sentence])
        precision_all += precision
        recall_all += recall
        f_score_all += f_score

    # '.foramt' was a typo that raised AttributeError on each of these lines.
    print('Precision : {}'.format(round(precision_all / total_test_length, 4)))
    print('Recall : {}'.format(round(recall_all / total_test_length, 4)))
    print('F Score : {}'.format(round(f_score_all / total_test_length, 4)))
from PyRouge.pyrouge import Rouge

r = Rouge()

# A simple example of how rouge can be calculated
# print(...) with one argument gives the same output on Python 2 and 3; the
# bare print statement is a SyntaxError on Python 3.
print(r.rouge_l([[1, 7, 6, 7, 5], [0, 2, 8, 3, 5]], [[1, 2, 3, 4, 5], [3, 9, 5]]))

# A more practical example of how it can be used for summary evaluation
system_generated_summary = " The Kyrgyz President pushed through the law requiring the use of ink during the upcoming Parliamentary and Presidential elections In an effort to live up to its reputation in the 1990s as an island of democracy. The use of ink is one part of a general effort to show commitment towards more open elections. improper use of this type of ink can cause additional problems as the elections in Afghanistan showed. The use of ink and readers by itself is not a panacea for election ills."
manual_summmary = " The use of invisible ink and ultraviolet readers in the elections of the Kyrgyz Republic which is a small, mountainous state of the former Soviet republic, causing both worries and guarded optimism among different sectors of the population. Though the actual technology behind the ink is not complicated, the presence of ultraviolet light (of the kind used to verify money) causes the ink to glow with a neon yellow light. But, this use of the new technology has caused a lot of problems. "

print(r.rouge_l([system_generated_summary], [manual_summmary]))
# Fragment of a larger routine: no enclosing definition is visible and the
# final statement is cut off, so the code is kept exactly as found.
# Visible behaviour: bigram lists for the "ctr" and "inconst_neg" summaries
# are built with ngrams(..., 2); rouge_one(<system>_side_2, gold_side_2)
# results are appended to the rouge2_* lists; r_summ_evaluate.rouge_l is
# called on space-joined column slices of row row_ind of summ_data, with
# columns 3:6 as the first argument every time.
# NOTE(review): as laid out here, the first '#' on the line turns everything
# after it into a comment - confirm the original line breaks.
ctr_side_2 = list(ngrams(ctr1_tokens, 2)) + list(ngrams( ctr2_tokens, 2)) + list(ngrams(ctr3_tokens, 2)) inc_neg_side_2 = list(ngrams(inconst_neg1_tokens, 2)) + list( ngrams(inconst_neg2_tokens, 2)) + list(ngrams(inconst_neg3_tokens, 2)) #rouge2_rand_list.append(rouge_one(rand_side_2, gold_side_2)) rouge2_lead_list.append(rouge_one(lead_side_2, gold_side_2)) rouge2_hybrid1_list.append(rouge_one(hybrid_side_2, gold_side_2)) rouge2_inconst_list.append(rouge_one(inconst_side_2, gold_side_2)) rouge2_neg_list.append(rouge_one(neg_side_2, gold_side_2)) rouge2_ctr_list.append(rouge_one(ctr_side_2, gold_side_2)) rouge2_inconst_neg_list.append(rouge_one(inc_neg_side_2, gold_side_2)) # 1 : X.AUTHID, 2 : spec_domain, 3: site.content 4-6: GOLD, 7-9: LEAD, 10-12: Hybrid, 13-15: Incons, 16-18: Neg, 19-21: Ctr, 22-24: InconsNeg rouge_lcs_lead_list.append( r_summ_evaluate.rouge_l(" ".join(summ_data.iloc[row_ind, 3:6]), " ".join(summ_data.iloc[row_ind, 6:9]))) rouge_lcs_hybrid1_list.append( r_summ_evaluate.rouge_l(" ".join(summ_data.iloc[row_ind, 3:6]), " ".join(summ_data.iloc[row_ind, 9:12]))) #rouge_lcs_rand_list.append(r_summ_evaluate.rouge_l(" ".join(summ_data.iloc[row_ind, 3:6]), " ".join(summ_data.iloc[row_ind, 9:12]))) rouge_lcs_inconst_list.append( r_summ_evaluate.rouge_l(" ".join(summ_data.iloc[row_ind, 3:6]), " ".join(summ_data.iloc[row_ind, 12:15]))) rouge_lcs_neg_list.append( r_summ_evaluate.rouge_l(" ".join(summ_data.iloc[row_ind, 3:6]), " ".join(summ_data.iloc[row_ind, 15:18]))) rouge_lcs_ctr_list.append( r_summ_evaluate.rouge_l(" ".join(summ_data.iloc[row_ind, 3:6]), " ".join(summ_data.iloc[row_ind, 18:21]))) rouge_lcs_inconst_neg_list.append( r_summ_evaluate.rouge_l(" ".join(summ_data.iloc[row_ind, 3:6]),
def rouge_l(S, I):
    """Return only the F-score that ``Rouge.rouge_l`` reports for ``S`` and ``I``.

    Each argument is wrapped in a one-element list before scoring; the
    precision and recall values are discarded.
    """
    scorer = Rouge()
    _, _, f_measure = scorer.rouge_l([S], [I])
    return f_measure