Example #1
0
    def test_bleu(self):
        """BLEU must agree whether inputs are raw strings or token lists,
        and the analogous Japanese pair must score the same as the English one."""
        calculator = BLEUCalculator()
        score_from_text = calculator.bleu(
            "I am waiting on the beach",
            "He is walking on the beach",
        )
        tokenized_summary = "I am waiting on the beach".split()
        tokenized_references = ["He is walking on the beach".split()]
        score_from_tokens = calculator.bleu(tokenized_summary,
                                            tokenized_references)
        self.assertLess(abs(score_from_text - score_from_tokens), 1e-8)

        # Japanese tokenization on the analogous sentence pair.
        calculator_ja = BLEUCalculator(lang="ja")
        score_ja = calculator_ja.bleu("私はビーチで待ってる", "彼がベンチで待ってる")

        self.assertLess(abs(score_from_text - score_ja), 1e-8)
    def convert_excel_to_df_and_evaluate(self):
        """Load the Excel input file into a DataFrame and append per-row
        BLEU / ROUGE-1 / ROUGE-2 / ROUGE-L scores as new columns."""
        frame = pd.ExcelFile(self.input_file).parse()

        bleu_ja = BLEUCalculator(lang="ja")
        rouge = RougeCalculator(lang="ja")
        bleu_scores = []
        rouge1_scores = []
        rouge2_scores = []
        rougel_scores = []
        for _, row in frame.iterrows():
            ref = row['ref_text']
            hyp = row['input_text']
            # BLEU evaluation
            bleu_scores.append(bleu_ja.bleu(ref, hyp))
            # ROUGE-1 / ROUGE-2 evaluation
            rouge1_scores.append(rouge.rouge_n(summary=ref,
                                               references=hyp,
                                               n=1))
            rouge2_scores.append(rouge.rouge_n(summary=ref,
                                               references=hyp,
                                               n=2))
            # ROUGE-L evaluation
            rougel_scores.append(rouge.rouge_l(summary=ref, references=hyp))
            # ROUGE-BE evaluation is intentionally disabled (extra deps needed).

        frame['bleu'] = bleu_scores
        frame['rouge_1'] = rouge1_scores
        frame['rouge_2'] = rouge2_scores
        frame['rouge_long'] = rougel_scores
        return frame
def eval_rouges(refrence_summary, model_summary):
    """Score `model_summary` against `refrence_summary`.

    Returns a tuple (rouge_1, rouge_2, rouge_l, rouge_be, bleu_score).
    """
    rouge = RougeCalculator(stopwords=True, lang="en")

    rouge_1 = rouge.rouge_n(summary=model_summary,
                            references=refrence_summary,
                            n=1)
    rouge_2 = rouge.rouge_n(summary=model_summary,
                            references=[refrence_summary],
                            n=2)
    rouge_l = rouge.rouge_l(summary=model_summary,
                            references=[refrence_summary])

    # You need spaCy to calculate ROUGE-BE
    rouge_be = rouge.rouge_be(summary=model_summary,
                              references=[refrence_summary])

    bleu_score = BLEUCalculator().bleu(summary=model_summary,
                                       references=[refrence_summary])

    return rouge_1, rouge_2, rouge_l, rouge_be, bleu_score
Example #4
0
def main(args):
    """Print the corpus-average BLEU of the system output vs. the references.

    Each line of args.system_output is scored against the same-index line of
    args.reference.
    """
    hypotheses = read_file(args.system_output)
    references = read_file(args.reference)
    scorer = BLEUCalculator()
    scores = [
        scorer.bleu(summary=hypothesis, references=references[i])
        for i, hypothesis in enumerate(hypotheses)
    ]
    print('SACRE_BLEU\t%.6f' % (np.average(scores)))
Example #5
0
def evaluate_bleu(summary, references, lang="zh"):
    """Return (average BLEU, per-pair BLEU list) for paired summaries/references.

    `summary` and `references` are parallel sequences; each pair is scored
    independently and the arithmetic mean is returned alongside the raw list.
    """
    calculator = BLEUCalculator(lang=lang)
    assert len(summary) == len(references), "number of summary and references should be equal"

    scores = [calculator.bleu(s, rs) for s, rs in zip(summary, references)]
    return sum(scores) / len(scores), scores
def cal_bleu(prediction_str, target_str):
    """Average sentence-level BLEU between predictions and targets.

    Both arguments are parallel sequences of token lists. Tokens are joined
    with spaces, and everything from the first '<eos>' marker onward
    (including the space before it) is stripped before scoring.
    """
    def _strip_eos(text):
        # Cut at '<eos>' (dropping the preceding space); leave the text
        # untouched when the marker is absent or at position 0.
        eos_index = text.find('<eos>')
        return text[:eos_index - 1] if eos_index > 0 else text

    bleu = BLEUCalculator()
    total_bleu = []
    for prediction_tokens, target_tokens in zip(prediction_str, target_str):
        prediction_rel = _strip_eos(' '.join(prediction_tokens))
        # BUG FIX: the original sliced targets with find('<eos>') - 1
        # unconditionally, so when no '<eos>' was present (find == -1) the
        # slice [:-2] silently chopped the last two characters.
        target_rel = _strip_eos(' '.join(target_tokens))
        total_bleu.append(bleu.bleu(prediction_rel, target_rel))
    return np.mean(total_bleu)
Example #7
0
def myeval(valid_x, valid_y, vocab, model):
    """Evaluate `model` on `limit` minibatches from (valid_x, valid_y).

    Greedy-decodes each batch, writes predictions and references to files
    under tmp/, accumulates ROUGE-1/2/L and BLEU (Chinese tokenization)
    per decoded sentence, then prints the averaged metrics and perplexity.
    Puts the model back into train mode before returning.

    NOTE(review): the final averages divide by len(pred) * limit, where
    `pred` is the LAST batch's prediction list — exact only if every batch
    has the same size; confirm batching is uniform.
    """
    rouge = RougeCalculator(stopwords=True, lang="zh")
    bleu_ch = BLEUCalculator(lang="zh")

    model.eval()
    eval_batch_num = 0
    # Running accumulators over all decoded sentences in all batches.
    sum_rouge_1 = 0
    sum_rouge_2 = 0
    sum_rouge_L = 0
    score_ch = 0
    sum_loss = 0
    limit = 63  # number of minibatches to evaluate
    logging.info('Evaluating on %d minibatches...' % limit)
    i2w = {key: value for value, key in vocab.items()}  # index -> word lookup
    ckpt_file = args.ckpt_file[9:]  # presumably strips a fixed path prefix — TODO confirm
    fout_pred = open(os.path.join('tmp/systems', '%s.txt' % ckpt_file), "w")
    fout_y = open(os.path.join('tmp/models', 'ref_%s.txt' % ckpt_file), "w")
    while eval_batch_num < limit:
        with torch.no_grad():
            loss = run_batch(valid_x, valid_y, model)
            sum_loss += loss
            _, x = valid_x.next_batch()
            pred = greedy(model, x, vocab)
            _, y = valid_y.next_batch()
            y = y[:,1:].tolist()  # drop the leading start-of-sequence token
            for idx in range(len(pred)):
                # Map token ids back to words, skipping end and pad tokens.
                line_pred = [i2w[tok] for tok in pred[idx] if tok != vocab[config.end_tok] and tok != vocab[config.pad_tok]]
                line_y = [i2w[tok] for tok in y[idx] if tok != vocab[config.end_tok] and tok != vocab[config.pad_tok]]
                fout_pred.write(" ".join(line_pred) + "\n")
                fout_y.write(" ".join(line_y) + "\n")
                sum_rouge_1 += rouge.rouge_n(references=" ".join(line_y),summary=" ".join(line_pred),n=1)
                sum_rouge_2 += rouge.rouge_n(references=" ".join(line_y),summary=" ".join(line_pred),n=2)
                sum_rouge_L += rouge.rouge_l(references=" ".join(line_y),summary=" ".join(line_pred))
                # NOTE(review): sumeval's signature is bleu(summary, references);
                # the reference is passed as summary here — confirm this is intended.
                score_ch += bleu_ch.bleu(" ".join(line_y), " ".join(line_pred))
            eval_batch_num += 1
    fout_pred.close()
    fout_y.close()
    # Averages assume a constant batch size (see docstring note).
    avg_rouge_1 = sum_rouge_1/(len(pred) * limit)
    avg_rouge_2 = sum_rouge_2/(len(pred) * limit)
    avg_rouge_L = sum_rouge_L/(len(pred) * limit)
    avg_bleu_ch = score_ch/(len(pred) * limit)
    avg_loss = sum_loss/limit
    print("ROUGE_1 = ",avg_rouge_1)
    print("ROUGE_2 = ",avg_rouge_2)
    print("ROUGE_L = ",avg_rouge_L)
    print("BLEU = ", avg_bleu_ch)
    print("Perplexity = ", math.pow(2, avg_loss))
    model.train()
class SacreBleu():
    """Reward function wrapping sumeval's BLEU, scaled to [0, 1]."""

    def __init__(self):
        self.bleu_calc = BLEUCalculator()

    def compute_reward(self, samples, sequence, model):
        """Return one BLEU/100 reward per (sample, generated sequence) pair.

        References come from each sample's full target text; both sides are
        truncated at the first " EOS" marker before scoring.
        """
        references = []
        for pair in samples:
            target_text = pair.get_text(pair.full_target_tokens, model.vocab)
            references.append(target_text.split(" EOS")[0])

        summaries = []
        for generated in sequence:
            joined = " ".join(str(token) for token in generated)
            summaries.append(joined.split(" EOS")[0])

        # sumeval BLEU is on a 0-100 scale; normalize to 0-1.
        return [
            self.bleu_calc.bleu(summaries[i], references[i]) / 100
            for i in range(len(references))
        ]
Example #9
0
def computeSacreBleu(translation_path,
                     reference_path,
                     lang,
                     detokenize_trans=True,
                     detokenize_ref=False):
    """Read translations and references from files, optionally detokenize
    either side with Moses, then print and return the corpus BLEU score."""
    bleu = BLEUCalculator(lang=lang)
    translations = readSentences(translation_path)
    references = readSentences(reference_path)
    if detokenize_trans or detokenize_ref:
        detokenizer = MosesDetokenizer(lang)

        if detokenize_trans:
            translations = [detokenizer([sentence]) for sentence in translations]
        if detokenize_ref:
            references = [detokenizer([sentence]) for sentence in references]
    bleu_score = bleu.bleu(summary=translations,
                           references=[references],
                           score_only=True)
    print(bleu_score)
    return bleu_score
Example #10
0
    def test_custom_lang(self):
        """Both calculators should accept a user-defined BaseLang whose
        tokenizer splits text on '/'."""

        class SlashLang(BaseLang):
            def __init__(self):
                super(SlashLang, self).__init__("cs")

            def tokenize(self, text):
                # Tokens are delimited by slashes instead of whitespace.
                return text.split("/")

        custom_lang = SlashLang()
        rouge = RougeCalculator(lang=custom_lang)
        rouge_score = rouge.rouge_n(
            summary="I/went/to/the/Mars/from/my/living/town.",
            references="I/went/to/Mars",
            n=1)

        bleu = BLEUCalculator(lang=custom_lang)
        bleu_score = bleu.bleu("I/am/waiting/on/the/beach",
                               "He/is/walking/on/the/beach")

        self.assertGreater(rouge_score, 0)
        self.assertGreater(bleu_score, 0)
Example #11
0
class SumEvaluator:
    """Evaluator class for generation.

    A wrapper class of the sumeval library. Scores predicted generations
    against references with the configured subset of
    {rouge_1, rouge_2, rouge_l, rouge_be, bleu} and returns the per-pair
    scores as a pandas DataFrame.
    """
    def __init__(self,
                 metrics: List[str] = [
                     "rouge_1", "rouge_2", "rouge_l", "rouge_be", "bleu"
                 ],
                 lang: str = "en",
                 stopwords: bool = True,
                 stemming: bool = True,
                 use_porter=True):
        """
        Args:
            metrics: metric names to compute; stored sorted. (The mutable
                default is never mutated, so sharing it across calls is safe.)
            lang: language passed to the BLEU calculator. NOTE(review): ROUGE
                always uses English ("en-porter"/"en") regardless of `lang` —
                confirm this asymmetry is intended.
            stopwords: whether ROUGE removes stopwords.
            stemming: whether ROUGE applies stemming.
            use_porter: select the Porter-stemmer ROUGE variant.
        """
        if use_porter:
            self.rouge = RougeCalculator(stopwords=stopwords,
                                         stemming=stemming,
                                         lang="en-porter")
        else:
            self.rouge = RougeCalculator(stopwords=stopwords,
                                         stemming=stemming,
                                         lang="en")
        self.bleu = BLEUCalculator(lang=lang)
        self.metrics = sorted(metrics)

    def eval(self, true_gens: List[str], pred_gens: List[str]):
        """Score each (true, pred) pair on every configured metric.

        Args:
            true_gens: reference texts.
            pred_gens: predicted texts, same length as `true_gens`.

        Returns:
            DataFrame with columns ["pred", "true"] + self.metrics,
            one row per pair.

        Raises:
            KeyError: if self.metrics contains an unknown metric name.
                (BUG FIX: the original silently skipped unknown names, which
                later broke the DataFrame column alignment; it also relied on
                its if-branch order coincidentally matching sorted order.)
        """
        assert len(true_gens) == len(pred_gens)

        # Metric name -> scoring function. Driving the loop from
        # self.metrics (already sorted) guarantees the score order always
        # matches the column order below.
        scorers = {
            # sumeval BLEU is on a 0-100 scale; divide to align with ROUGE.
            "bleu": lambda pred, true: self.bleu.bleu(pred, true) / 100.0,
            "rouge_1": lambda pred, true: self.rouge.rouge_n(
                summary=pred, references=[true], n=1),
            "rouge_2": lambda pred, true: self.rouge.rouge_n(
                summary=pred, references=[true], n=2),
            "rouge_be": lambda pred, true: self.rouge.rouge_be(
                summary=pred, references=[true]),
            "rouge_l": lambda pred, true: self.rouge.rouge_l(
                summary=pred, references=[true]),
        }

        eval_list = []
        for true_gen, pred_gen in zip(true_gens, pred_gens):
            scores = [scorers[name](pred_gen, true_gen)
                      for name in self.metrics]
            eval_list.append([pred_gen, true_gen] + scores)
        eval_df = pd.DataFrame(eval_list,
                               columns=["pred", "true"] + self.metrics)
        return eval_df
Example #12
0
#Evaluation task ##################################
# Demo of the sumeval metrics on toy sentences: ROUGE-1/2/L/BE and BLEU.

from sumeval.metrics.rouge import RougeCalculator

rouge = RougeCalculator(stopwords=True, lang="en")

# ROUGE-N accepts a single reference string...
rouge_1 = rouge.rouge_n(summary="I went to the Mars from my living town.",
                        references="I went to Mars",
                        n=1)

# ...or a list of alternative references.
rouge_2 = rouge.rouge_n(summary="I went to the Mars from my living town.",
                        references=["I went to Mars", "It's my living town"],
                        n=2)

rouge_l = rouge.rouge_l(summary="I went to the Mars from my living town.",
                        references=["I went to Mars", "It's my living town"])

# You need spaCy to calculate ROUGE-BE

rouge_be = rouge.rouge_be(summary="I went to the Mars from my living town.",
                          references=["I went to Mars", "It's my living town"])

print("ROUGE-1: {}, ROUGE-2: {}, ROUGE-L: {}, ROUGE-BE: {}".format(
    rouge_1, rouge_2, rouge_l, rouge_be).replace(", ", "\n"))

from sumeval.metrics.bleu import BLEUCalculator

bleu = BLEUCalculator()
score = bleu.bleu("I am waiting on the beach", "He is walking on the beach")

# NOTE(review): variable is named bleu_ja but lang="en" — the example appears
# truncated here; confirm the intended language tag.
bleu_ja = BLEUCalculator(lang="en")
            n=2))

# Finish the ROUGE-2 scoring of `summary`: references 2-4 (entries for
# references 0-1 were appended just above this fragment), then average.
for ref_index in range(2, 5):
    rouge_2_scores.append(rouge.rouge_n(
        summary=summary,
        references=summaries[ref_index],
        n=2))
mean_rouge_2_score = np.mean(rouge_2_scores)

# BLEU of `summary` against each of the five references, then the mean.
bleu_scores = []
for ref_index in range(5):
    bleu_scores.append(bleu.bleu(summary=summary,
                                 references=summaries[ref_index]))
mean_bleu_scores = np.mean(bleu_scores)
Example #14
0
# ROUGE-2 of the generated summary against each of the five references.
rouge_2_scores = []
for ref_index in range(5):
    rouge_2_scores.append(
        rouge.rouge_n(summary=generated_summary,
                      references=summaries[ref_index],
                      n=2))
mean_rouge_2_score = np.mean(rouge_2_scores)

# BLEU of the generated summary against each of the five references.
bleu_scores = []
for ref_index in range(5):
    bleu_scores.append(
        bleu.bleu(summary=generated_summary,
                  references=summaries[ref_index]))
mean_bleu_scores = np.mean(bleu_scores)
Example #15
0
        if abs((currentR - R).sum()) < 0.00001:
            break
    R[0] *= 3
    R[-1] *= 3
    max_index = np.argmax(R)
    if max_index == 0:
        count_max_0 += 1
    ans.append(sents[max_index])

    print(index)
    #    print(ref[index])
    #    print(ans[index])
    rouge_1 += rouge.rouge_n(summary=ans[index], references=ref[index], n=1)
    rouge_2 += rouge.rouge_n(summary=ans[index], references=ref[index], n=2)
    rouge_l += rouge.rouge_l(summary=ans[index], references=ref[index])
    bleu_score += bleu.bleu(ans[index], ref[index][0])

data_number -= empty
print('average rouge_1 = ', rouge_1 / data_number)
print('average rouge_2 = ', rouge_2 / data_number)
print('average rouge_L = ', rouge_l / data_number)
print('average bleu = ', bleu_score / data_number)
print('选择第一句的概率: ', count_max_0 / data_number)

# Reset the metric accumulators for the next evaluation pass.
rouge_1 = 0
rouge_2 = 0
rouge_l = 0
bleu_score = 0
from gensim.summarization.summarizer import summarize
count = 0  # presumably counts processed stories in the loop below — TODO confirm
for story in stories: