Example #1
def generation_metrics(hypothesis_list, reference_list):
    rouge = Rouge()
    rouge_scores = rouge.get_scores(
        # score every (hypothesis, reference) pair: repeat each hypothesis once per reference
        hyps=[h for h in hypothesis_list for _ in reference_list],
        refs=reference_list * len(hypothesis_list),
        avg=True)
    bleu_scores = {}
    for i in range(1,6):
        bleu_scores['bleu%s'%i] = bleu_score.corpus_bleu(
            list_of_references=[reference_list]*len(hypothesis_list),
            hypotheses=hypothesis_list,
            weights=[1.0/i]*i)
    return rouge_scores, bleu_scores
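A minimal usage sketch for the function above. The imports and the toy hypothesis/reference lists are assumptions added for illustration; they are not part of the original example.

from rouge import Rouge                 # pip package "rouge"
from nltk.translate import bleu_score   # provides corpus_bleu

hypothesis_list = ["the cat sat on the mat", "a quick brown fox"]
reference_list = ["the cat is on the mat", "the quick brown fox jumps"]

rouge_scores, bleu_scores = generation_metrics(hypothesis_list, reference_list)
print(rouge_scores["rouge-l"]["f"], bleu_scores["bleu1"])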
Example #2
def rouge_eval(hyps, refer):
    rouge = Rouge()
    # print(hyps)
    # print(refer)
    # print(rouge.get_scores(hyps, refer))
    try:
        score = rouge.get_scores(hyps, refer)[0]
        mean_score = np.mean([
            score["rouge-1"]["f"], score["rouge-2"]["f"], score["rouge-l"]["f"]
        ])
    except Exception:
        mean_score = 0.0
    return mean_score
Example #3
def get_rouge_scores(generated_summary, ref_summary):
    rouge = Rouge()
    r_scores = rouge.get_scores(generated_summary, ref_summary)

    scores = {}
    # avoid shadowing the Rouge() instance with the loop variable
    for rouge_type, outputs in r_scores[0].items():
        temp_output = {}
        for output, value in outputs.items():
            temp_output[output] = round(value, 3)

        scores[rouge_type] = temp_output

    return scores
Example #4
def score(ref, hypo):
    scorer = Rouge()
    method = "ROUGE_L"

    final_scores = {}

    score, scores = scorer.compute_score(ref, hypo)
    if type(score) == list:
        for m, s in zip(method, score):
            final_scores[m] = s
    else:
        final_scores[method] = score
    return final_scores
Example #5
 def predict(self, test_data, beam_search=True):
     """
     :param test_data: test data loader
     :param beam_search: whether use beam search, otherwise use greedy search
     :return:
     """
     rouge = Rouge()
     f1_r1, f1_r2, f1_rl = [], [], []
     print("生成摘要...")
     sleep(0.5)
     self.model.eval()
     count = 0
     with torch.no_grad():
         # batch_sz = 1
         for batch_data in tqdm(test_data):
             # (batch_sz, src_len), (batch_sz, src_len, 1), (batch_sz, tgt_len)
             src_vec, src_mask, tgt_vec = [
                 data.to(self.device) for data in batch_data[:-1]
             ]
             oov_dict = batch_data[-1]
             # (batch_sz, src_len, 2*hidden_sz), (batch_sz, 2*hidden_sz)
             enc_output, enc_hidden = self.model.encoder(
                 replace_oov(src_vec, self.vocab, self.device))
             # (2, batch_sz, hidden_sz) -> (1, batch_sz, hidden_sz)
             context_vec = self.model.reducer(enc_hidden)
             # (batch_sz, )
             dec_input = tgt_vec[:, 0].to(self.device)
             # (batch_sz, src_len, 1)
             coverage_vec = torch.zeros_like(src_vec,
                                             dtype=torch.float).unsqueeze(2)
             if beam_search:
                 summary = self.beam_search(dec_input, context_vec,
                                            coverage_vec, enc_output,
                                            src_mask, src_vec, oov_dict,
                                            self.beam_width)
             else:
                 summary = self.beam_search(dec_input, context_vec,
                                            coverage_vec, enc_output,
                                            src_mask, src_vec, oov_dict, 1)
             sentence = id2output(summary, self.vocab, oov_dict[0])
             ref = id2output(tgt_vec.squeeze().cpu().tolist(), self.vocab,
                             oov_dict[0])
             score = rouge.get_scores(sentence, ref)[0]
             f1_r1.append(score['rouge-1']['f'])
             f1_r2.append(score['rouge-2']['f'])
             f1_rl.append(score['rouge-l']['f'])
             if (count + 1) % 100 == 0:
                 print('\r', ref)
                 print(sentence, '\n')
             count += 1
     return map(np.mean, (f1_r1, f1_r2, f1_rl))
Example #6
    def _eval_epoch(sess, mode, epoch_no):
        """
        This function is the same as _eval_epoch() in
        baseline_seq2seq_attn_main.py.
        """
        if mode == 'val':
            data_iterator.switch_to_val_data(sess)
        else:
            data_iterator.switch_to_test_data(sess)

        refs, hypos = [], []
        while True:
            try:
                fetches = [
                    batch['target_text'][:, 1:],
                    infer_outputs.predicted_ids[:, :, 0]
                ]
                feed_dict = {tx.global_mode(): tf.estimator.ModeKeys.EVAL}
                target_texts_ori, output_ids = \
                    sess.run(fetches, feed_dict=feed_dict)

                target_texts = tx.utils.strip_special_tokens(
                    target_texts_ori.tolist(), is_token_list=True)
                target_texts = tx.utils.str_join(target_texts)
                output_texts = tx.utils.map_ids_to_strs(
                    ids=output_ids, vocab=val_data.target_vocab)

                tx.utils.write_paired_text(target_texts,
                                           output_texts,
                                           log_dir + mode + '_results' +
                                           str(epoch_no) + '.txt',
                                           append=True,
                                           mode='h',
                                           sep=' ||| ')

                for hypo, ref in zip(output_texts, target_texts):
                    if config_data.eval_metric == 'bleu':
                        hypos.append(hypo)
                        refs.append([ref])
                    elif config_data.eval_metric == 'rouge':
                        hypos.append(tx.utils.compat_as_text(hypo))
                        refs.append(tx.utils.compat_as_text(ref))
            except tf.errors.OutOfRangeError:
                break

        if config_data.eval_metric == 'bleu':
            return tx.evals.corpus_bleu_moses(list_of_references=refs,
                                              hypotheses=hypos)
        elif config_data.eval_metric == 'rouge':
            rouge = Rouge()
            return rouge.get_scores(hyps=hypos, refs=refs, avg=True)
Example #7
def main():
    hyp = 'hyp'
    raw_ref = 'abstracts'
    FJoin = os.path.join
    files_hyp = [FJoin(hyp, f) for f in os.listdir(hyp)]
    files_raw_ref = [FJoin(raw_ref, f) for f in os.listdir(hyp)]
    
    f_hyp = []
    f_raw_ref = []
    print("number of document: ", len(files_hyp))
    for file in files_hyp:
        f = open(file)
        f_hyp.append(f.read())
        f.close()
    for file in files_raw_ref:
        f = open(file)
        f_raw_ref.append(f.read())
        f.close()
        
    rouge_1_tmp = []
    rouge_2_tmp = []
    rouge_L_tmp = []
    number = 1
    for hyp, ref in zip(f_hyp, f_raw_ref):
        try:
            rouge = Rouge()
            scores = rouge.get_scores(hyp, ref, avg=True)
            # rouge_1 = scores["rouge-1"]["r"]
            # rouge_2 = scores["rouge-2"]["r"]
            # rouge_L = scores["rouge-l"]["r"]
            rouge_1 = scores["rouge-1"]["f"]
            rouge_2 = scores["rouge-2"]["f"]
            rouge_L = scores["rouge-l"]["f"]
            rouge_1_tmp.append(rouge_1)

            rouge_2_tmp.append(rouge_2)
            rouge_L_tmp.append(rouge_L)
            print(scores)
        except Exception:
            pass
    rouge_1_avg = sta.mean(rouge_1_tmp)
    rouge_2_avg = sta.mean(rouge_2_tmp)
    rouge_L_avg = sta.mean(rouge_L_tmp)
    print("total file : " , len(f_hyp))
    print(len(rouge_1_tmp))
    print('Rouge-1')
    print(rouge_1_avg)
    print('Rouge-2')
    print(rouge_2_avg)
    print('Rouge-L')
    print(rouge_L_avg)
Example #8
def compute_rouge_python(cand, ref, is_input_files=False, language="en"):
    """
    Computes ROUGE scores using the python package (https://pypi.org/project/py-rouge/).

    Args:
        cand (list or str): If `is_input_files` is `False`, `cand` is a list of strings
            containing predicted summaries. If `is_input_files` is `True`, `cand` is the path
            to the file containing the predicted summaries.
        ref (list or str): If `is_input_files` is `False`, `ref` is a list of strings
            containing reference summaries. If `is_input_files` is `True`, `ref` is the path
            to the file containing the reference summaries.
        is_input_files (bool, optional): If True, inputs are file names. Otherwise, inputs are
            lists of predicted and reference summaries. Defaults to False.
        language (str, optional): Language of the input text. Supported values are "en" and
            "hi". Defaults to "en".

    Returns:
        dict: Dictionary of ROUGE scores.

    """
    supported_languages = ["en", "hi"]
    if language not in supported_languages:
        raise Exception(
            "Language {0} is not supported. Supported languages are: {1}.".
            format(language, supported_languages))

    if is_input_files:
        candidates = [line.strip() for line in open(cand, encoding="utf-8")]
        references = [line.strip() for line in open(ref, encoding="utf-8")]
    else:
        candidates = cand
        references = ref

    print("Number of candidates: {}".format(len(candidates)))
    print("Number of references: {}".format(len(references)))
    assert len(candidates) == len(references)

    if language == "en":
        evaluator = Rouge()
    else:
        evaluator = RougeExt(
            metrics=["rouge-n", "rouge-l"],
            max_n=2,
            limit_length=False,
            apply_avg=True,
            language=language,
        )

    scores = evaluator.get_scores(candidates, references)

    return scores
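A hedged usage sketch for compute_rouge_python above, assuming the `rouge` package is installed (and `RougeExt` is available for the Hindi path); the two toy summaries are made up for illustration.

candidates = ["the cat sat on the mat"]
references = ["the cat is sitting on the mat"]

scores = compute_rouge_python(candidates, references)  # language defaults to "en"
print(scores)  # score structure depends on the installed ROUGE package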
Example #9
def evaluate(model, data, model_name='trs', ty='valid', verbose=True):
    hyp_g, ref, r1, r2, rl, r_avg = [], [], [], [], [], []
    t = Translator(model)
    rouge = Rouge()

    l, loss = [], None
    pbar = tqdm(enumerate(data), total=len(data))
    for j, batch in pbar:
        if ty != "test":
            loss = model.train_one_batch(batch, train=False)
            l.append(loss.item())

        if ((j <= 1 and ty != "test") or ty == "test"):
            if ty != 'test':
                sent_g = model.decoder_greedy(
                    batch)  # 1-decoder generation. for testing
            else:
                sent_g = model.eval_one_batch(batch)  # 2-decoder generation.
            # sent_b, _ = t.translate_batch(batch) # beam search

            for i, sent in enumerate(sent_g):
                hyp_g.append(sent)
                ref.append(batch["target_txt"][i])
                rouges = rouge.get_scores(
                    sent, batch["target_txt"][i])[0]  # (hyp, ref)

                r1_val, r2_val, rl_val = rouges['rouge-1']["f"], rouges[
                    'rouge-2']["f"], rouges['rouge-l']["f"]
                r1.append(r1_val)
                r2.append(r2_val)
                rl.append(rl_val)
                r_avg.append(np.mean([r1_val, r2_val, rl_val]))
        pbar.set_description("EVAL loss:{:.4f} r_avg:{:.2f}".format(
            np.mean(l), np.mean(r_avg)))
        if (j > 1 and ty == "train"): break
    if l: loss = np.mean(l)
    r_avg = np.mean(r_avg)
    r1 = np.mean(r1)
    r2 = np.mean(r2)
    rl = np.mean(rl)

    if (verbose):
        print(
            "\nEVAL loss: {:.4f} r_avg: [{:.2f}] r1: {:.2f} r2: {:.2f} rl: {:.2f}"
            .format(loss, r_avg, r1, r2, rl))
        for hyp, gold in zip(hyp_g, ref):
            print("HYP: ")
            print(hyp)
            print("GOLD: ")
            print(gold)
    return loss, r_avg
Example #10
def eval(gt_text, arg_text, non_arg_text=None):
    if non_arg_text:
        length_arg = len(arg_text)
        length_no_arg = len(non_arg_text)
        fpr_values = []
        for arg_length in [220, 330, 440]:
            ratio_arg = arg_length / length_arg
            ratio_no_arg = (660 - arg_length) / length_no_arg
            if ratio_arg > 0.3:
                summary_arg = extractive_summary(arg_text, min(ratio_arg, 1))
            else:
                summary_arg = extractive_summary(arg_text, ratio_arg, 20, 200)
            summary_no_arg = extractive_summary(non_arg_text, ratio_no_arg, 20,
                                                200)
            summary = summary_no_arg + summary_arg
            rouge = Rouge()
            score = rouge.get_scores(summary, gt_text)
            print(summary)
            print(score[0]['rouge-1'])
            sco = score[0]['rouge-1']
            fpr_values.append(sco['f'])
            fpr_values.append(sco['p'])
            fpr_values.append(sco['r'])
        return fpr_values
    else:
        summary = arg_text
        #length = len(arg_text)
        #ratio = 665/length
        #if ratio > 0.3:
        #    summary = extractive_summary(arg_text, min(ratio,1))
        #else:
        #    summary = extractive_summary(arg_text, ratio, 20, 200)
    print(summary)
    rouge = Rouge()
    score = rouge.get_scores(summary, gt_text)
    print(score)
    sco = score[0]['rouge-1']
    return sco['f'], sco['p'], sco['r']
Example #11
def get_rouge_score(result_rouge_list):
    # line[0]: input sentence, line[1]: reference sentence, line[2]-line[11]: generated
    # sentences, line[12]: ground-truth NLL; the F-score is used throughout.
    rouge_1_f = 0.0
    rouge_2_f = 0.0
    rouge_l_f = 0.0
    rouge = Rouge()
    for line in result_rouge_list:
        rouge_score = rouge.get_scores(hyps=line[2], refs=line[1])
        rouge_1_f += rouge_score[0]["rouge-1"]['f']
        rouge_2_f += rouge_score[0]["rouge-2"]['f']
        rouge_l_f += rouge_score[0]["rouge-l"]['f']
    print("rouge_1_f : ", rouge_1_f / len(result_rouge_list))
    print("rouge_2_f : ", rouge_2_f / len(result_rouge_list))
    print("rouge_l_f : ", rouge_l_f / len(result_rouge_list))
Example #12
 def __init__(self,
              model,
              test_data,
              key,
              batch=100,
              device=-1,
              max_length=100):
     self.model = model
     self.test_data = test_data
     self.key = key
     self.batch = batch
     self.device = device
     self.max_length = max_length
     self.rouge = Rouge()
Example #13
def calculate_rouge(hypothesis, reference):
    """eval methods 支援list[str]格式"""
    rouge = Rouge()

    # print(f"hypothesis: {hypothesis}")
    # print(f"reference: {reference}")
    hypothesis = [" ".join(h) for h in hypothesis]
    reference = [" ".join(r) for r in reference]

    # hypothesis = ["the #### transcript is a written version of each day's cnn student news program use this transcript to help students with reading comprehension and vocabulary use the weekly newsquiz to test your knowledge of storie s you saw on cnn student news"]
    # reference = ["this page includes the show transcript use the transcript to help students with reading comprehension andvocabulary at the bottom of the page , comment for a chance to be mentioned on cnn student news . you must be a teacher or a student age # # or older to request a mention on the cnn student news roll call . the weekly newsquiz tests students ' knowledge of even ts in the news"]

    scores = rouge.get_scores(hypothesis, reference, avg=True)
    return scores
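A hedged usage sketch: calculate_rouge above expects tokenized inputs (lists of token lists), which it joins into space-separated strings before scoring. The toy tokens below are assumptions for illustration.

hypothesis = [["the", "cat", "sat", "on", "the", "mat"]]
reference = [["the", "cat", "is", "on", "the", "mat"]]

scores = calculate_rouge(hypothesis, reference)
print(scores["rouge-1"]["f"])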
Example #14
def main():
    parser = argparse.ArgumentParser(description='Rouge Metric Calculator')
    parser.add_argument('-f', '--file', help="File mode", action='store_true')
    parser.add_argument('-a',
                        '--avg',
                        help="Average mode",
                        action='store_true')
    parser.add_argument('--ignore_empty',
                        action='store_true',
                        help="Ignore empty hypothesis")
    parser.add_argument('hypothesis', type=str, help='Text of file path')
    parser.add_argument('reference', type=str, help='Text or file path')
    parser.add_argument("--metrics",
                        nargs="+",
                        type=str.upper,
                        choices=METRICS_CHOICES.keys(),
                        help="Metrics to use (default=all)")
    parser.add_argument("--stats",
                        nargs="+",
                        type=str.upper,
                        choices=STATS_CHOICES,
                        help="Stats to use (default=all)")

    args = parser.parse_args()

    metrics = args.metrics
    stats = args.stats

    if metrics is not None:
        metrics = [METRICS_CHOICES[m] for m in args.metrics]

    if args.file:
        hyp, ref = args.hypothesis, args.reference
        assert (os.path.isfile(hyp))
        assert (os.path.isfile(ref))

        files_rouge = FilesRouge(hyp, ref, metrics, stats)
        scores = files_rouge.get_scores(avg=args.avg,
                                        ignore_empty=args.ignore_empty)

        print(json.dumps(scores, indent=2))
    else:
        hyp, ref = args.hypothesis, args.reference
        assert (type(hyp) == str)
        assert (type(ref) == str)

        rouge = Rouge(metrics, stats)
        scores = rouge.get_scores(hyp, ref, avg=args.avg)

        print(json.dumps(scores, indent=2))
Example #15
 def __init__(self, embedding_sz=5):
     self.encoder_model = SentenceTransformer('bert-base-nli-mean-tokens')
     self.rouge = Rouge()
     self.cluster_n = 5
     self.embedding_sz = embedding_sz
     self.kmeans = KMeans(n_clusters=self.cluster_n)
     self.cv = CountVectorizer(strip_accents='ascii',
                               token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b',
                               lowercase=True,
                               stop_words='english')
     self.legal_classes = {}
     self.stop_words = set(stopwords.words('english'))
     self.naive_bayes_model = MultinomialNB()
     self.legal_class_list = []
Example #16
def test_rouge(temp_dir, cand, ref):
    candidates = [line.strip() for line in open(cand, encoding="utf-8")]
    references = [line.strip() for line in open(ref, encoding="utf-8")]

    assert len(candidates) == len(references)

    cnt = len(candidates)
    current_time = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
    tmp_dir = os.path.join(temp_dir, "rouge-tmp-{}".format(current_time))

    if not os.path.isdir(tmp_dir):
        os.mkdir(tmp_dir)
        os.mkdir(tmp_dir + "/candidate")
        os.mkdir(tmp_dir + "/reference")
    try:
        for i in range(cnt):
            if len(references[i]) < 1:
                continue
            with open(tmp_dir + "/candidate/cand.{}.txt".format(i),
                      "w",
                      encoding="utf-8") as f:
                f.write(candidates[i])
            with open(tmp_dir + "/reference/ref.{}.txt".format(i),
                      "w",
                      encoding="utf-8") as f:
                f.write(references[i])

        candidates_, references_ = [], []
        for cand, ref in zip(candidates, references):
            if len(cand) * len(ref) != 0:
                cand = " ".join(cand.split("<q>"))
                ref = " ".join(ref.split("<q>"))
                candidates_.append(cand)
                references_.append(ref)

        # calculate rouge score
        logger.info(
            "| Calculating rouge score on {} candidate-reference pairs ...".
            format(len(candidates_)))

        rouge = Rouge()
        rouge_result_dict = rouge.get_scores(candidates_,
                                             references_,
                                             avg=True)

    finally:
        if os.path.isdir(tmp_dir):
            shutil.rmtree(tmp_dir)

    return rouge_result_dict
Example #17
def eval(model, iter):

	model.eval()
	total_loss = 0
	rouge_scores  = []
	loss_list = []

	r = Rouge()
	step = 0



	with torch.no_grad():

		for i, b in enumerate(iter):
			a = b.ARTICLE
			s = b.SUMMARY
			output, mu, logvar, z = model(a, s)
			out = output[1:].view(-1, output.shape[-1])
			summary = s[1:].view(-1)
			NLL, KL, w = model.KL_loss(out.float(), mu, logvar, summary, step)
			step += 1
			loss = NLL + w * KL
			total_loss += loss.item()

			if i % 10 == 0:
				# print("Loss at iteration " + str(i) +  ": ", loss.item())
				loss_list.append(loss.item())

			"""
			if i > 1:
				preds = convert_to_text(output, model.text.vocab)
				labels = convert_to_text(s, model.text.vocab, False)
				preds, labels = detokenize(preds, labels)

				for i in range(len(preds)):
					rouge_scores.append([i, preds[i], labels[i], r.get_scores(preds[i], labels[i])])

				return total_loss / len(iter), loss_list, rouge_scores
			"""
		preds = convert_to_text(output, model.text.vocab)
		labels = convert_to_text(s, model.text.vocab, False)
		preds, labels = detokenize(preds, labels)


		for i in range(len(preds)):
			rouge_scores.append([i, preds[i], labels[i], r.get_scores(preds[i], labels[i])])


	return total_loss/len(iter), loss_list, rouge_scores
Example #18
def calc_metrics(refs, hyps, metric):
    print("Count:", len(hyps))
    print("Ref:", refs[-1])
    print("Hyp:", hyps[-1])

    if metric in ("bleu", "all"):
        from nltk.translate.bleu_score import corpus_bleu
        print("BLEU: ", corpus_bleu([[r] if r is not list else r for r in refs], hyps))
    if metric == "legacy_rouge":
        print(calc_legacy_rouge(refs, hyps))
    if metric in ("rouge", "all"):
        rouge = Rouge()
        scores = rouge.get_scores(hyps, refs, avg=True)
        print("ROUGE: ", scores)
Example #19
def cnndm_eval(ratio):
    """Evaluation for the CNN/DailyMail dataset"""
    for func_name in BASELINES:
        print("Evaluating \"{}\"".format(func_name))
        print("=" * 20)
        with open(str(RESULTS_DIR / "cnndm_baseline_{}_{}.pred".format(
                func_name, int(ratio * 100)))) as fin:
            predictions = fin.read().split("\n")[:-1]
        with open(str(DATA_DIR / "cnndm" / "test.txt.tgt.tagged")) as fin:
            references = fin.read().replace("<t> ", "").replace(
                "</t> ", "").split("\n")[:-1]
        assert all([len(x) > 0 for x in predictions])
        scores = Rouge().get_scores(predictions, references, avg=True)
        pprint(scores)
Example #20
def evaluate(val_data, train_data, model, encoder_layer, decoder_layer, n_output_tokens):
    rouge = Rouge()
    input_texts, target_texts, input_sequences, target_sequences = train_data
    
    train_target_texts, _, train_target_sequences, _ = train_data
    max_decoder_seq_length = max([len(txt) for txt in train_target_texts])
    target_token_index = dict([(char, i) for i, char in enumerate(train_target_sequences)])
    reverse_target_token_index = dict(
        (i, char) for char, i in target_token_index.items()
    )

    X1_test, X2_test, y_test = preprocessing(train_data)

    # example runs
    for idx in range(3):
        input_seq = X1_test[idx : idx + 1]
        predicted = predict_sequence(
            encoder_layer,
            decoder_layer,
            input_seq,
            n_output_tokens,
            reverse_target_token_index,
            max_decoder_seq_length,
        )

        print("- -- -")
        # print("Input sentence:", "\n".join(input_texts[idx]))
        print("Ground truth sentence:", "\n".join(target_texts[idx]))
        print("Decoded sentence:", predicted)
        print("- -- -")
    
    # evaluate LSTM
    total = len(X1_test)
    refs = []
    hyps = []
    for idx in range(total):
        input_seq = X1_test[idx : idx + 1]
        predicted_summ = predict_sequence(
            encoder_layer,
            decoder_layer,
            input_seq,
            n_output_tokens,
            reverse_target_token_index,
            max_decoder_seq_length,
        )
        refs.append("\n".join(input_texts[idx]))
        hyps.append(predicted_summ)

    rouge_scores = rouge.get_scores(hyps, refs, avg=True)
    print("Average ROUGE Score: {}".format(rouge_scores))
Example #21
def main():
    hyp = 'hyp'
    raw_ref = 'duc2002_summaries_I'
    FJoin = os.path.join
    files_hyp = [FJoin(hyp, f) for f in os.listdir(hyp)]
    files_raw_ref = [FJoin(raw_ref, f) for f in os.listdir(hyp)]

    f_hyp = []
    f_raw_ref = []
    print("number of document: ", len(files_hyp))
    for file in files_hyp:
        f = open(file)
        f_hyp.append(f.read())
        f.close()
    for file in files_raw_ref:
        f = open(file)
        f_raw_ref.append(f.read())
        f.close()

    # import pdb
    # pdb.set_trace()
    rouge_1_tmp = []
    rouge_2_tmp = []
    rouge_L_tmp = []
    number = 1
    for hyp, ref in zip(f_hyp, f_raw_ref):
        try:
            rouge = Rouge()
            scores = rouge.get_scores(hyp, ref, avg=True)
            rouge_1 = scores["rouge-1"]["r"]
            rouge_2 = scores["rouge-2"]["r"]
            rouge_L = scores["rouge-l"]["r"]
            rouge_1_tmp.append(rouge_1)
            rouge_2_tmp.append(rouge_2)
            rouge_L_tmp.append(rouge_L)
            # import pdb; pdb.set_trace()
            # print(number)
            print(scores)
        except Exception:
            pass
        # number +=1
    rouge_1_avg = sta.mean(rouge_1_tmp)
    rouge_2_avg = sta.mean(rouge_2_tmp)
    rouge_L_avg = sta.mean(rouge_L_tmp)
    print('Rouge-1')
    print(rouge_1_avg)
    print('Rouge-2')
    print(rouge_2_avg)
    print('Rouge-L')
    print(rouge_L_avg)
Example #22
    def _eval_epoch(mode):
        if mode == 'val':
            data_iterator.switch_to_val_data()
        else:
            data_iterator.switch_to_test_data()
        model.eval()

        refs, hypos = [], []
        evalStart = 0
        for batch in data_iterator:

            infer_outputs = model(batch, mode="val")
            output_ids = infer_outputs["sample_id"][:, :, 0].cpu()
            target_texts_ori = [text[1:] for text in batch['target_text']]
            target_texts = tx.utils.strip_special_tokens(target_texts_ori,
                                                         is_token_list=True)
            output_texts = tx.data.vocabulary.map_ids_to_strs(
                ids=output_ids, vocab=val_data.target_vocab)

            if (evalStart == 0):
                src_words = model.source_embedder(batch['source_text_ids'])
                print('src wrds', src_words[0, 0:2, 0:10].flatten(),
                      target_texts[0])
                evalStart = 1
            else:
                pass

            for hypo, ref in zip(output_texts, target_texts):
                #ADD ROUGE
                if config_data.eval_metric == 'bleu':
                    hypos.append(hypo)
                    refs.append([ref])
                elif config_data.eval_metric == 'rouge':
                    hh = str(hypo)
                    if (len(hh) == 0):
                        hh = " "
                    rr = ' '.join(ref)
                    hypos.append(hh)
                    refs.append(rr)


#ADD ROUGE
        if config_data.eval_metric == 'bleu':
            return tx.evals.corpus_bleu_moses(list_of_references=refs,
                                              hypotheses=hypos)
        elif config_data.eval_metric == 'rouge':
            rouge = Rouge()
            print('HH', type(hypos), type(hypos[0]), hypos[0])
            print('RR', type(refs), type(refs[0]), refs[0])
            return rouge.get_scores(hyps=hypos, refs=refs, avg=True)
Example #23
def rouge_eval2(ref_dir, dec_dir):
    output = []

    for f in sorted(glob.glob(ref_dir + '/*')):
        with open(f, 'r') as file:
            output.append({'ref': ' '.join(file.readlines())})

    for idx, f in enumerate(sorted(glob.glob(dec_dir + '/*'))):
        with open(f, 'r') as file:
            output[idx]['hyp'] = ' '.join(file.readlines())

    hyps, refs = map(list, zip(*[[d['hyp'], d['ref']] for d in output]))
    rouge = Rouge()
    return rouge.get_scores(hyps, refs, avg=True)
Example #24
 def _get_rouge_score(sentence_a, sentence_b):
     rouge = Rouge()
     weight = [0.33, 0.33, 0.33]  # relative weights of rouge-1, rouge-2 and rouge-l
     sentence_a = " ".join(sentence_a)
     sentence_b = " ".join(sentence_b)
     try:
         scores = rouge.get_scores(sentence_a, sentence_b, avg=True)
     except Exception:
         return 0
     rouge_1 = scores["rouge-1"]
     rouge_2 = scores["rouge-2"]
     rouge_l = scores["rouge-l"]
     f = rouge_1["f"] * weight[0] + rouge_2["f"] * weight[0] + rouge_l["f"] * weight[0]
     return f
Example #25
def cal_Rouge(results, examples):
    rouge = Rouge(metrics=["rouge-l"])
    pred_answers = []
    answers = []
    for example in examples:
        pred_answer = results[example.qid][1]
        answers.append(example.answer)
        pred_answers.append(pred_answer)

    scores = rouge.get_scores(pred_answers, answers, avg=True)
    Rouge_L = scores["rouge-l"]["f"]
    P = scores["rouge-l"]["p"]
    R = scores["rouge-l"]["r"]
    return Rouge_L, P, R
Example #26
def eval(preds, targets, avg=True):
    rouge = Rouge()
    scores = rouge.get_scores(preds, targets, avg)

    rouge2_f_metric = scores['rouge-2']['f']
    rouge2_p_metric = scores['rouge-2']['p']
    rouge2_r_metric = scores['rouge-2']['r']
    rougel_f_metric = scores['rouge-l']['f']
    rougel_p_metric = scores['rouge-l']['p']
    rougel_r_metric = scores['rouge-l']['r']

    return rouge2_f_metric, rouge2_p_metric, rouge2_r_metric, \
           rougel_f_metric, rougel_p_metric, rougel_r_metric
Example #27
 def get_metric(self, reset=True):
     logger.info("[INFO] Hyps and Refer number is %d, %d",
                 len(self.prediction), len(self.referece))
     if len(self.prediction) == 0 or len(self.referece) == 0:
         logger.error("During testing, no hyps or refers is selected!")
         return
     rouge = Rouge()
     scores_all = rouge.get_scores(self.prediction, self.referece, avg=True)
     if reset:
         self.prediction = []
         self.referece = []
     logger.info(scores_all)
     scores_all = remend_score(scores_all)
     return scores_all
Example #28
def rouge_score(data):
    rouge = Rouge()
    scores = {'rouge-1': [], 'rouge-2': [], 'rouge-l': []}
    for refs, sys in data:
        for score in rouge.get_scores([sys] * len(refs), refs):
            scores['rouge-1'].append(score['rouge-1']['f'])
            scores['rouge-2'].append(score['rouge-2']['f'])
            scores['rouge-l'].append(score['rouge-l']['f'])
    n = len(scores['rouge-1'])
    if n > 0:
        return sum(scores['rouge-1']) / n, sum(scores['rouge-2']) / n, sum(
            scores['rouge-l']) / n
    else:
        return 0, 0, 0
Example #29
def evaluate(params):
    gen = test(params)
    reals = []
    preds = []
    with tqdm(total=params["max_num_to_eval"], position=0, leave=True) as pbar:
        for i in range(params["max_num_to_eval"]):
            trial = next(gen)
            reals.append(trial.real_abstract)
            preds.append(trial.abstract)
            pbar.update(1)
    r = Rouge()
    scores = r.get_scores(preds, reals, avg=True)
    print("\n\n")
    pprint.pprint(scores)
def compute_rouge_scores(pred_seq, target_seq):
    """
    :param pred_seq: Predicted sequence
    :param target_seq: Target sequence
    :return: a pair (rouge_2, rouge_l) containing the rouge-2 and rouge-l scores given pred_seq
    and target_seq
    """
    rouge = Rouge()
    pred_seq_str = ' '.join([str(x) for x in pred_seq])
    target_seq = ' '.join([str(x) for x in target_seq])

    scores = rouge.get_scores(pred_seq_str, target_seq)

    return scores[0]['rouge-2']['f'], scores[0]['rouge-l']['f']
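A hedged usage sketch for compute_rouge_scores above: the sequences may be arbitrary token ids, since the function stringifies each element before scoring; the ids below are made up for illustration.

pred_seq = [12, 7, 7, 34, 5]
target_seq = [12, 7, 34, 5]

rouge_2_f, rouge_l_f = compute_rouge_scores(pred_seq, target_seq)
print(rouge_2_f, rouge_l_f)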