Example #1
    def on_epoch_end(self, epoch, logs=None):
        # Observe one or two examples during training
        print_sentence(s1, self.model)
        print_sentence(s2, self.model)

        summarization = test_df['summarization'].values
        text = test_df['text'].values
        pred = []
        for t, s in tqdm(zip(text, summarization)):
            pred.append(gen_sent(t, self.model))

        rouge_1 = rouge.Rouge().get_scores(
            pred, summarization.tolist())[0]['rouge-1']['f']
        rouge_2 = rouge.Rouge().get_scores(
            pred, summarization.tolist())[0]['rouge-2']['f']
        rouge_l = rouge.Rouge().get_scores(
            pred, summarization.tolist())[0]['rouge-l']['f']
        print('rouge-1:', rouge_1)
        print('rouge-2:', rouge_2)
        print('rouge-l:', rouge_l)

        # Save the best model
        if logs['loss'] <= self.lowest:
            self.lowest = logs['loss']
            model.save_weights('best_model.weights')
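A minimal, self-contained sketch of the call pattern used above, assuming the `rouge` module here is the pltrdy `rouge` package (`pip install rouge`), whose `get_scores` returns one metrics dict per hypothesis/reference pair:

import rouge  # assumption: the pltrdy `rouge` package

hyps = ["the cat sat on the mat"]
refs = ["a cat was sitting on the mat"]

scorer = rouge.Rouge()
scores = scorer.get_scores(hyps, refs)          # list: one dict per hypothesis/reference pair
print(scores[0]["rouge-1"]["f"])                # F1 of the first pair, as indexed in the callback above
print(scorer.get_scores(hyps, refs, avg=True))  # or let the library average over all pairs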
Example #2
def evaluate_rouge(reference_path, generated_path, multi=True):
    # Evaluate model scores
    actual_word_lists = []
    multi_flag = True
    with open(reference_path) as f:
        for line in f:
            if multi:
                sents = line.strip().lower().split('#')
                actual_word_lists.append([x for x in sents])
            else:
                actual_word_lists.append([line.strip().lower()])
    generated_word_lists = []
    with open(generated_path) as f:
        for line in f:
            generated_word_lists.append(line.strip().lower())
    actual_word_lists = actual_word_lists[:len(generated_word_lists)]
    print(actual_word_lists[0],len(generated_word_lists))

    for aggregator in ['Avg', 'Best', 'Individual']:
        print('Evaluation with {}'.format(aggregator))
        apply_avg = aggregator == 'Avg'
        apply_best = aggregator == 'Best'
        import rouge
        evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l', 'rouge-w'],
                               max_n=4,
                               limit_length=True,
                               length_limit=100,
                               length_limit_type='words',
                               apply_avg=apply_avg,
                               apply_best=apply_best,
                               alpha=0.5, # Default F1_score
                               weight_factor=1.2,
                               stemming=True)
        all_hypothesis  = generated_word_lists
        all_references = actual_word_lists
        scores = evaluator.get_scores(all_hypothesis, all_references)

        for metric, results in sorted(scores.items(), key=lambda x: x[0]):
            if not apply_avg and not apply_best: # value is a type of list as we evaluate each summary vs each reference
                for hypothesis_id, results_per_ref in enumerate(results):
                    nb_references = len(results_per_ref['p'])
                    for reference_id in range(nb_references):
                        print('\tHypothesis #{} & Reference #{}: '.format(hypothesis_id, reference_id))
                        print('\t' + prepare_results(metric,results_per_ref['p'][reference_id], results_per_ref['r'][reference_id], results_per_ref['f'][reference_id]))
                print()
            else:
                print(prepare_results(metric, results['p'], results['r'], results['f']))




        # Assumption: Pythonrouge comes from the separate `pythonrouge` package.
        from pythonrouge.pythonrouge import Pythonrouge
        pythonrouge_scorer = Pythonrouge(summary_file_exist=False,
                        summary=generated_word_lists, reference=actual_word_lists,
                        n_gram=2, ROUGE_SU4=True, ROUGE_L=False,
                        recall_only=True, stemming=True, stopwords=True,
                        word_level=True, length_limit=True, length=50,
                        use_cf=False, cf=95, scoring_formula='average',
                        resampling=True, samples=1000, favor=True, p=0.5)
        score = pythonrouge_scorer.calc_score()
        print(score)
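For orientation, a self-contained sketch of the Diego999 py-rouge evaluator with the 'Avg' aggregator, showing the shape of its output (an illustrative assumption, not part of the original snippet):

import rouge  # assumption: the Diego999 py-rouge package

evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l'],
                        max_n=2,
                        limit_length=True,
                        length_limit=100,
                        length_limit_type='words',
                        apply_avg=True,   # 'Avg': one averaged score per metric
                        apply_best=False,
                        alpha=0.5,
                        stemming=True)

hypotheses = ["the cat sat on the mat"]
references = [["a cat was sitting on the mat", "the cat is on the mat"]]  # several refs per hypothesis
scores = evaluator.get_scores(hypotheses, references)
print(scores['rouge-1'])  # {'f': ..., 'p': ..., 'r': ...}
print(scores['rouge-l'])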
Example #3
def compute_rouge_document(string1, string2):
    evaluator = rouge.Rouge(metrics=['rouge-n'],
                            max_n=2,
                            limit_length=True,
                            length_limit=1000)
    scores = evaluator.get_scores(string1, string2)
    return scores['rouge-1']['f']
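A hedged usage sketch for the helper above; it assumes `compute_rouge_document` is in scope and that `rouge` refers to the Diego999 py-rouge package (whose default apply_avg=True makes `scores['rouge-1']` a plain dict):

# Illustrative only; assumes compute_rouge_document from the snippet above and `import rouge` are available.
score = compute_rouge_document("the cat sat on the mat",
                               "a cat was sitting on the mat")
print(score)  # unigram F1 between the two documents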
Example #4
def test_rouge_metrics(candidates, references):
    for multiref in ["average", "best"]:
        # PERL 1.5.5 reference
        apply_avg = multiref == "average"
        apply_best = multiref == "best"
        evaluator = pyrouge.Rouge(
            metrics=["rouge-n", "rouge-l"],
            max_n=4,
            apply_avg=apply_avg,
            apply_best=apply_best,
            alpha=0.5,
            stemming=False,
            ensure_compatibility=False,
        )
        scores = evaluator.get_scores(candidates, references)

        lower_split_references = [
            [ref.lower().split() for ref in refs_per_candidate] for refs_per_candidate in references
        ]

        lower_split_candidates = [candidate.lower().split() for candidate in candidates]

        m = Rouge(variants=[1, 2, 4, "L"], multiref=multiref, alpha=0.5)
        for candidate, references_per_candidate in zip(lower_split_candidates, lower_split_references):
            m.update((candidate, references_per_candidate))
        results = m.compute()

        for key in ["1", "2", "4", "L"]:
            assert pytest.approx(results[f"Rouge-{key}-R"], abs=1e-4) == scores[f"rouge-{key.lower()}"]["r"]
            assert pytest.approx(results[f"Rouge-{key}-P"], abs=1e-4) == scores[f"rouge-{key.lower()}"]["p"]
            assert pytest.approx(results[f"Rouge-{key}-F"], abs=1e-4) == scores[f"rouge-{key.lower()}"]["f"]
Example #5
 def __init__(self,
              vocab=None,
              specials=[],
              type='n',
              n=1,
              max_length=None,
              tokenizer=None,
              alpha=0.5,
              which='f',
              tokenize_sent=False,
              eos='\n',
              dump_file=None):
     """Initalizes the Perplexity metrc."""
     self.vocab = vocab
     self.specials = specials
     self.n = n
     self.type = type
     self.rouge = rouge.Rouge(
         metrics=['rouge-' + type],
         max_n=n,
         limit_length=max_length is not None,
         length_limit=max_length,
         length_limit_type='words',
         apply_avg=True,
         apply_best=False,
         alpha=alpha,  # Default F1_score
         weight_factor=1.0,  # Correct bug
         stemming=True)
     self.tokenizer = tokenizer
     self.which = which
     self.tokenize_sent = tokenize_sent
     self.eos = eos
     self.dump_file = dump_file
     nltk.download('punkt', quiet=True)
Example #6
    def _test(metric_device):
        engine = Engine(update)
        m = Rouge(variants=[1, 2, "L"], alpha=0.5, device=metric_device)
        m.attach(engine, "rouge")

        engine.run(data=list(range(size)), max_epochs=1)

        assert "rouge" in engine.state.metrics

        evaluator = pyrouge.Rouge(
            metrics=["rouge-n", "rouge-l"],
            max_n=4,
            apply_avg=True,
            apply_best=False,
            alpha=0.5,
            stemming=False,
            ensure_compatibility=False,
        )
        rouge_1_f, rouge_2_f, rouge_l_f = (0, 0, 0)
        for candidate, references in data:
            scores = evaluator.get_scores([candidate], [references])
            rouge_1_f += scores["rouge-1"]["f"]
            rouge_2_f += scores["rouge-2"]["f"]
            rouge_l_f += scores["rouge-l"]["f"]

        assert pytest.approx(engine.state.metrics["Rouge-1-F"],
                             abs=1e-4) == rouge_1_f / len(data)
        assert pytest.approx(engine.state.metrics["Rouge-2-F"],
                             abs=1e-4) == rouge_2_f / len(data)
        assert pytest.approx(engine.state.metrics["Rouge-L-F"],
                             abs=1e-4) == rouge_l_f / len(data)
Example #7
def get_average_scores(hyps, refs, maxlen=400, stop_words=[]):
    rouge_scorer = rouge.Rouge()
    averaged_scores = {
        'rouge-1': {
            'f': 0,
            'p': 0,
            'r': 0
        },
        'rouge-2': {
            'f': 0,
            'p': 0,
            'r': 0
        },
        'rouge-l': {
            'f': 0,
            'p': 0,
            'r': 0
        }
    }

    scores = rouge_scorer.get_scores(hyps, refs)
    for metric in averaged_scores.keys():
        for values in scores:
            for sub_metric in averaged_scores[metric]:
                averaged_scores[metric][sub_metric] += values[metric][
                    sub_metric]
    for key in averaged_scores.keys():
        for sub_key in averaged_scores[key].keys():
            averaged_scores[key][sub_key] /= len(hyps)
    return averaged_scores
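If the `rouge` used here is the pltrdy package, the same averaging can usually be delegated to the library; a minimal sketch under that assumption:

import rouge  # assumption: the pltrdy `rouge` package

hyps = ["the cat sat on the mat", "dogs bark loudly"]
refs = ["a cat was sitting on the mat", "the dog barked loudly"]

# avg=True returns a single dict with rouge-1/rouge-2/rouge-l already averaged over all pairs
print(rouge.Rouge().get_scores(hyps, refs, avg=True))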
Example #8
def get_oracle_single_paper(batch):
    candidates = batch['candidates']
    references = batch['references']

    evaluator = rouge.Rouge()
    max_r1 = 0.
    max_score = None
    max_sent = ''

    def check_length(c):
        l = len(c)
        if l < 5 or l > 2500:
            return 0
        return 1

    # sentences with too long or too short of characters will break the script
    candidates = [c.strip() for c in candidates if check_length(c)]
    for tgt in references:
        ref = [tgt] * len(candidates)
        scores = evaluator.get_scores(candidates, ref)
        r1 = [s['rouge-1']['f'] for s in scores]
        max_idx = r1.index(max(r1))
        if max_r1 < r1[max_idx]:
            max_r1 = r1[max_idx]
            max_score = scores[max_idx]
            max_sent = candidates[max_idx]

    return max_sent.replace('\n', '').strip()
Example #9
def main():
    # Read train-test-splits and get test ids
    splits = pd.read_csv(SPLITS_PATH, sep=";")
    test_ids = sorted(
        [int(fn[-3:]) for fn in splits[splits.SET == "TEST"].ID.values])

    #  Read data files from disk
    with open(ESSAYS_PATH, "r", encoding="utf-8") as f:
        data = json.load(f)
    with open(PREDICTIONS_PATH, "r", encoding="utf-8") as f:
        predictions = json.load(f)

    # Extract prediction labels
    # Also, make sure they are sorted based on the integer value of their id
    predictions = sorted(predictions, key=lambda x: int(x["id"]))
    predicted_texts = [p["prompt"] for p in predictions]

    # Extract true labels
    # Also, make sure they are sorted based on their id
    test_split = list(filter(lambda x: x["id"] in test_ids, data))
    test_split = sorted(test_split, key=lambda x: x["id"])
    true_prompts = [p['prompt'] for p in test_split]

    # Instantiate ROUGE evaluator
    evaluator = rouge.Rouge(metrics=['rouge-l'])

    print(len(true_prompts))
    print(len(predicted_texts))
    # Calculate the scores and print them nicely
    scores = evaluator.get_scores(predicted_texts, true_prompts, avg=True)
    print(scores)
Example #10
def rougeScore(
    goldSummary, machineSummary, rowBuilder, averageArray
):  #Function generates ROUGE-1, ROUGE-2, ROUGE-3, ROUGE-4, and ROUGE-L scores given a gold and machine-generated summary.
    #Create a new Rouge object using the default settings provided in documentation.
    rogueScores = rouge.Rouge(
        metrics=['rouge-n', 'rouge-l'],
        max_n=4,
        limit_length=False,
        apply_avg=False,
        apply_best=False,
        alpha=0.5,
        weight_factor=1.2,
        stemming=True
    )  #Use default parameters recommended by library developer.
    scores = rogueScores.get_scores(
        machineSummary, goldSummary)  #Generate the n-gram and rouge-l scores.
    rowBuilder.append(scores.get('rouge-1')[0].get('f')
                      [0])  #Add ROUGE-1 F1 value to the row.
    rowBuilder.append(scores.get('rouge-2')[0].get('f')
                      [0])  #Add ROUGE-2 F1 value to the row.
    rowBuilder.append(scores.get('rouge-3')[0].get('f')
                      [0])  #Add ROUGE-3 F1 value to the row.
    rowBuilder.append(scores.get('rouge-4')[0].get('f')
                      [0])  #Add ROUGE-4 F1 value to the row.
    rowBuilder.append(scores.get('rouge-l')[0].get('f')
                      [0])  #Add ROUGE-L F1 value to the row.

    #Add to existing running totals for average calculations.
    averageArray[0] = averageArray[0] + scores.get('rouge-1')[0].get('f')[0]
    averageArray[1] = averageArray[1] + scores.get('rouge-l')[0].get('f')[0]

    return rowBuilder
Example #11
def get_sorted(preds, refs, srcs):
    rouge_evaluator = rouge.Rouge(
        metrics=['rouge-n', 'rouge-l'],
        max_n=3,
        limit_length=True,
        length_limit=100,
        length_limit_type='words',
        apply_avg=False,
        apply_best=False,
        alpha=0.5,  # Default F1_score
        weight_factor=1.2,
        stemming=True)

    scores = rouge_evaluator.get_scores(preds, refs)
    all_f_scores = []
    for type in scores.keys():
        score_list = scores[type]
        f_scores = [x['f'] for x in score_list]
        all_f_scores.append(f_scores)
    all_f_scores = np.array(all_f_scores)
    mean_f_scores = np.mean(all_f_scores, axis=0)

    if srcs is not None:
        res = [{'mean_f': f.item(), 'pred': p, 'ref': r, 'src': s} \
               for f, p, r, s in zip(mean_f_scores, preds, refs, srcs)]
    else:
        res = [{'mean_f': f.item(), 'pred': p, 'ref': r} \
               for f, p, r in zip(mean_f_scores, preds, refs)]

    sorted_res = sorted(res, key=lambda x: x['mean_f'])
    return sorted_res
Example #12
def calculate_rouge(eval_name, eval_set, summ_task, summ_type):
    """
    Evaluate generated summaries with py-rouge
    Expects two lists, one of reference summaries and the other
    of generated summaries where the indexes correspond.
    """
    evaluator = rouge.Rouge(
        metrics=['rouge-n', 'rouge-l'],
        max_n=3,
        limit_length=False,
        #length_limit=100,
        length_limit_type='words',
        apply_avg=True,
        apply_best=False,
        alpha=1,
        weight_factor=1.2,
        stemming=False)

    # eval_set[1] == gen_summaries
    # eval_set[0] == ref_summaries
    metrics = []
    rouge_scores = evaluator.get_scores(eval_set[1], eval_set[0])
    results_file.write("\nPerformance by {e} on {l}, {t}\n".format(
        e=eval_name, l=summ_task, t=summ_type))
    for metric, results in sorted(rouge_scores.items(), key=lambda x: x[0]):
        if metric == "rouge-l":
            result = results['r']
        else:
            result = results['r']
        metrics.append(result)
        results_file.write("{m}\t{r}\n".format(m=metric, r=result))
    return metrics
Example #13
def main():
    args = parser.parse_args()
    data = pd.read_excel(args.pred_file,
                         sheet_name=None,
                         header=None,
                         names=["Sentence", "Correct_answer", "Answer"])
    # select sheet with factoid questions
    # todo make as argument
    data = data["factoid"]
    evaluator = rouge.Rouge(
        metrics=["rouge-n", "rouge-l"],
        max_n=4,
        limit_length=True,
        length_limit=100,
        length_limit_type="words",
        apply_avg=True,
        apply_best=False,
        alpha=0.5,  # Default F1_score
        weight_factor=1.2,
        stemming=True,
    )

    with open(args.output, "w") as fout:
        ground_truth = [str(el) for el in data["Correct_answer"].values]
        predicted = [str(el) for el in data["Answer"].values]
        squad_f1 = squad_v1_f1([[el] for el in ground_truth], predicted)
        scores = evaluator.get_scores(predicted, ground_truth)
        fout.write(f"squad f1: {squad_f1:.3f}\n")
        for m in scores:
            fout.write(
                f'{m}:  p: {scores[m]["p"]:.3f} r: {scores[m]["r"]:.3f} f: {scores[m]["f"]:.3f}\n'
            )
Example #14
def run_rouge(generated, references, max_length):

    if isinstance(generated, list):
        assert isinstance(references, list)
        assert len(generated) == len(references)
    else:
        assert isinstance(generated, str)
        assert isinstance(references, str)

        generated = [generated]
        references = [references]

    # most args are from `run_summarization.py`
    rouge_evaluator = rouge.Rouge(
        metrics=["rouge-n", "rouge-l"],
        max_n=2,
        limit_length=True,
        length_limit=max_length,
        length_limit_type="words",
        apply_avg=True,
        apply_best=False,
        alpha=0.5,  # Default F1_score
        weight_factor=1.2,
        stemming=True,
    )

    scores = rouge_evaluator.get_scores(generated, references)
    return scores
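A hypothetical call to run_rouge, assuming the function above is in scope (single strings are wrapped into lists, and apply_avg=True yields one dict keyed by metric):

# Illustrative only; assumes run_rouge from the snippet above and `import rouge` are available.
scores = run_rouge("the cat sat on the mat",
                   "a cat was sitting on the mat",
                   max_length=100)
print(scores["rouge-1"], scores["rouge-2"], scores["rouge-l"])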
Example #15
def best_rouge_score(file, article, summary, greedy, beam):
    candidates = [beam[0][1:-1], beam[1][2:-1], beam[2][2:-2]]

    evaluator = rouge.Rouge(
        metrics=['rouge-n', 'rouge-l'],
        max_n=2,
        limit_length=True,
        length_limit=100,
        length_limit_type='words',
        apply_avg=False,
        apply_best=True,
        alpha=0.5,  # Default F1_score
        weight_factor=1.2,
        stemming=False)

    rouge_scores = []

    for candidate in candidates:
        all_hypothesis = [candidate]
        all_references = [summary]
        scores = evaluator.get_scores(all_hypothesis, all_references)

        for metric, results in sorted(scores.items(), key=lambda x: x[0]):
            # print(prepare_results(metric, results['p'], results['r'], results['f']))
            rouge_scores.append([candidate, round(results['f'] * 100.0, 2)])
            break

    rouge_scores.sort(key=lambda x: x[1], reverse=True)

    return rouge_scores[0][0], rouge_scores[0][1]
Example #16
def eval_rouge(instances: List[CFRInstance]):
    references = []
    hypotheses = []

    evaluator = rouge.Rouge(
        metrics=['rouge-n', 'rouge-l', 'rouge-w'],
        max_n=4,
        limit_length=True,
        length_limit=100,
        length_limit_type='words',
        apply_avg=True,
        apply_best=False,
        alpha=0.5,  # Default F1_score
        weight_factor=1.2,
        stemming=True)

    by_instance = []
    for instance in instances:
        _r = [_clean_text(g) for g in instance.gold_cf_endings]
        _h = _clean_text(instance.predicted_ending)
        references.append(_r)
        hypotheses.append(_h)
        try:
            by_instance.append(evaluator.get_scores(_h, _r))
        except:
            by_instance.append({})

    scores = evaluator.get_scores(hypotheses, references)
    return {
        'rouge_all': scores,
        #'rouge_by_instance': by_instance
    }
Example #17
 def __init__(self, config):
     """Initialization from the configuration"""
     self.mode = config.controller_mode
     self.model_name = config.model_name
     self.model_name_version = config.model_name + "_" + config.model_version
     self.start_epoch = config.start_epoch
     self.num_epoch = config.num_epoch
     self.write_output = config.write_output
     self.batch_size = config.batch_size
     self.print_interval = config.train_print_interval
     self.gpu_id = config.gpu_id
     self.drop_out = config.drop_out
     self.dec_start_id = config.dec_start_id
     self.dec_end_id = config.dec_end_id
     self.model_path = config.model_path
     self.output_path = config.output_path
     self.random_seed = config.random_seed
     self.bow_pred_method = config.bow_pred_method
     self.train_log = TrainingLog(config)
     self.id2word = None
     self.target_metrics = config.target_metrics
     self.lm_load_path = config.lm_load_path
     self.rouge_evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l'],
                                        max_n=2)
     self.save_ckpt = config.save_ckpt
     self.eval_metrics_list = config.eval_metrics_list
     self.log_metrics = config.log_metrics
     self.gumbel_samples = config.gumbel_samples
     self.is_gumbel = config.is_gumbel
     return
Example #18
    def create_binary_sentence_classification_dataset_with_rouge(self):
        """
        Create a dataset for training a sentence classification model, where the binary y labels are assigned based on the
        best rouge score for a sentence in the article when compared to each sentence in the summary
        """
        # Initiate rouge evaluator
        evaluator = rouge.Rouge(metrics=['rouge-l'],
                                max_n=3,
                                limit_length=False,
                                length_limit_type='words',
                                apply_avg=False,
                                apply_best=True,
                                alpha=1,
                                weight_factor=1.2,
                                stemming=False)

        bioasq_collection = self._load_bioasq()
        training_data_dict = {}  #
        snip_id = 0
        for i, q in enumerate(bioasq_collection):
            question = q
            for snippet in bioasq_collection[q]['snippets']:
                training_data_dict[snip_id] = {}
                labels = []
                # Sentencize snippet
                snippet_text = snippet['snippet']
                tokenized_snip = self.nlp(snippet_text)
                snippet_sentences = [
                    s.text.strip() for s in tokenized_snip.sents
                ]
                # Sentencize abstract
                abstract_text = snippet['article']
                tokenized_abs = self.nlp(abstract_text)
                abstract_sentences = [
                    s.text.strip() for s in tokenized_abs.sents
                ]
                rouge_scores = []
                for abs_sen in abstract_sentences:
                    best_rouge = 0
                    for snip_sen in snippet_sentences:
                        rouge_score = self.calculate_sentence_level_rouge(
                            snip_sen, abs_sen, evaluator)
                        if best_rouge < rouge_score:
                            best_rouge = rouge_score
                    if best_rouge > .9:
                        label = 1
                    else:
                        label = 0
                    labels.append(label)
                training_data_dict[snip_id]['question'] = q
                training_data_dict[snip_id]['sentences'] = abstract_sentences
                training_data_dict[snip_id]['labels'] = labels
                snip_id += 1

        with open(
                "data/bioasq_abs2summ_binary_sent_classification_training.json",
                "w",
                encoding="utf=8") as f:
            json.dump(training_data_dict, f, indent=4)
Example #19
def sentences_tokens_Rouge(sentences=None,
                           tokens=None,
                           n_gram=4,
                           metrics=None,
                           aggregator='Best'):
    """
    计算Rouge指标
    sentences: inference结果caption句子列表
    tokens: 原图对应token列表
    n_gram: N-grams for ROUGE-N
    metrics: What ROUGE score to compute. Available: ROUGE-N, ROUGE-L, ROUGE-W. Default: ROUGE-N   可组合。['rouge-n', 'rouge-l', 'rouge-w']
    aggregator: 'Avg', 'Best', 'Individual'
    :return:
    """

    print('Rouge evaluation with {}'.format(aggregator))
    apply_avg = aggregator == 'Avg'
    apply_best = aggregator == 'Best'

    evaluator = rouge.Rouge(
        metrics=metrics,
        max_n=n_gram,
        limit_length=True,
        length_limit=100,
        length_limit_type='words',
        apply_avg=apply_avg,
        apply_best=apply_best,
        alpha=0.5,  # Default F1_score
        weight_factor=1.2,
        stemming=True)

    for hypothesis in sentences:
        hypothesis = " ".join(hypothesis)
        for reference in tokens:
            reference = " ".join(reference)
            scores = evaluator.get_scores(hypothesis, reference)

    print_score = True
    if print_score:
        for metric, results in sorted(scores.items(), key=lambda x: x[0]):
            if not apply_avg and not apply_best:  # value is a type of list as we evaluate each summary vs each reference
                for hypothesis_id, results_per_ref in enumerate(results):
                    nb_references = len(results_per_ref['p'])
                    for reference_id in range(nb_references):
                        print('\tHypothesis #{} & Reference #{}: '.format(
                            hypothesis_id, reference_id))
                        print('\t' + prepare_results(
                            results_per_ref['p'][reference_id],
                            results_per_ref['r'][reference_id],
                            results_per_ref['f'][reference_id]))
                print()
            else:
                print(
                    prepare_results(results['p'], results['r'], results['f'],
                                    metric))
        print()

    return scores
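As a compact illustration of the 'Best' aggregator that this function uses by default, a sketch under the assumption that `rouge` is the Diego999 py-rouge package:

import rouge  # assumption: the Diego999 py-rouge package

evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l'],
                        max_n=2,
                        apply_avg=False,
                        apply_best=True)  # 'Best': keep the best-scoring reference per hypothesis

hypotheses = ["the cat sat on the mat"]
references = [["a cat was sitting on the mat", "the cat is on the mat"]]
scores = evaluator.get_scores(hypotheses, references)
print(scores['rouge-1'])  # a single {'f': ..., 'p': ..., 'r': ...} dict, as in the 'Avg' case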
Example #20
    def __init__(self, rouge_type: str, name: str = "ROUGE") -> None:

        if rouge_type.lower() not in ["1", "2", "l"]:
            raise ValueError(("Invalid type of rouge metric '{}', "
                              "must be '1', '2' or 'L'").format(rouge_type))

        self.name = name
        self.rouge_type = rouge_type.lower()
        self.rouge = rouge.Rouge()
Example #21
 def __init__(self, rouge_weight, sari_weight, bleu_weight, eos_idx):
     import rouge as R
     self.rouge = R.Rouge(stats=["f"],
                          metrics=["rouge-1", "rouge-2", "rouge-l"])
     self.r_weight = rouge_weight
     self.s_weight = sari_weight
     self.b_weight = bleu_weight
     self.eos_idx = eos_idx
     self.smooth = SmoothingFunction()
Example #22
def ROUGE_sentence_score(expected, actual):
    # TODO: fix this so that it actually gives results
    evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l'],
                            max_n=4,
                            limit_length=True,
                            length_limit=100,
                            length_limit_type='words',
                            apply_avg="Avg")
    return evaluator.get_scores([actual], [expected])
Example #23
def get_rouge(candidate, reference):
    candidate = [i for i in candidate]
    # print(np.mean([len(i.split()) for i in candidate]))
    reference = [[i] for i in reference]

    def prepare_results(m, p, r, f):
        return '\t{}:\t{}: {:5.2f}\t{}: {:5.2f}\t{}: {:5.2f}'.format(
            m, 'P', 100.0 * p, 'R', 100.0 * r, 'F1', 100.0 * f)

    for aggregator in ['Avg', 'Best', 'Individual']:
        print('Evaluation with {}'.format(aggregator))
        apply_avg = aggregator == 'Avg'
        apply_best = aggregator == 'Best'

        evaluator = rouge.Rouge(
            metrics=['rouge-n', 'rouge-l', 'rouge-w'],
            max_n=4,
            limit_length=True,
            length_limit=100,
            length_limit_type='words',
            apply_avg=apply_avg,
            apply_best=apply_best,
            alpha=0.5,  # Default F1_score
            weight_factor=1.2,
            stemming=True)

        hypothesis_1 = "King Norodom Sihanouk has declined requests to chair a summit of Cambodia 's top political leaders , saying the meeting would not bring any progress in deadlocked negotiations to form a government .\nGovernment and opposition parties have asked King Norodom Sihanouk to host a summit meeting after a series of post-election negotiations between the two opposition groups and Hun Sen 's party to form a new government failed .\nHun Sen 's ruling party narrowly won a majority in elections in July , but the opposition _ claiming widespread intimidation and fraud _ has denied Hun Sen the two-thirds vote in parliament required to approve the next government .\n"
        references_1 = [
            "Prospects were dim for resolution of the political crisis in Cambodia in October 1998.\nPrime Minister Hun Sen insisted that talks take place in Cambodia while opposition leaders Ranariddh and Sam Rainsy, fearing arrest at home, wanted them abroad.\nKing Sihanouk declined to chair talks in either place.\nA U.S. House resolution criticized Hun Sen's regime while the opposition tried to cut off his access to loans.\nBut in November the King announced a coalition government with Hun Sen heading the executive and Ranariddh leading the parliament.\nLeft out, Sam Rainsy sought the King's assurance of Hun Sen's promise of safety and freedom for all politicians.",
            "Cambodian prime minister Hun Sen rejects demands of 2 opposition parties for talks in Beijing after failing to win a 2/3 majority in recent elections.\nSihanouk refuses to host talks in Beijing.\nOpposition parties ask the Asian Development Bank to stop loans to Hun Sen's government.\nCCP defends Hun Sen to the US Senate.\nFUNCINPEC refuses to share the presidency.\nHun Sen and Ranariddh eventually form a coalition at summit convened by Sihanouk.\nHun Sen remains prime minister, Ranariddh is president of the national assembly, and a new senate will be formed.\nOpposition leader Rainsy left out.\nHe seeks strong assurance of safety should he return to Cambodia.\n",
        ]

        hypothesis_2 = "China 's government said Thursday that two prominent dissidents arrested this week are suspected of endangering national security _ the clearest sign yet Chinese leaders plan to quash a would-be opposition party .\nOne leader of a suppressed new political party will be tried on Dec. 17 on a charge of colluding with foreign enemies of China '' to incite the subversion of state power , '' according to court documents given to his wife on Monday .\nWith attorneys locked up , harassed or plain scared , two prominent dissidents will defend themselves against charges of subversion Thursday in China 's highest-profile dissident trials in two years .\n"
        references_2 = "Hurricane Mitch, category 5 hurricane, brought widespread death and destruction to Central American.\nEspecially hard hit was Honduras where an estimated 6,076 people lost their lives.\nThe hurricane, which lingered off the coast of Honduras for 3 days before moving off, flooded large areas, destroying crops and property.\nThe U.S. and European Union were joined by Pope John Paul II in a call for money and workers to help the stricken area.\nPresident Clinton sent Tipper Gore, wife of Vice President Gore to the area to deliver much needed supplies to the area, demonstrating U.S. commitment to the recovery of the region.\n"

        all_hypothesis = [hypothesis_1, hypothesis_2]
        all_references = [references_1, references_2]

        scores = evaluator.get_scores(candidate, reference)

        for metric, results in sorted(scores.items(), key=lambda x: x[0]):
            if not apply_avg and not apply_best:  # value is a type of list as we evaluate each summary vs each reference
                for hypothesis_id, results_per_ref in enumerate(results):
                    nb_references = len(results_per_ref['p'])
                    for reference_id in range(nb_references):
                        pass
                #         print('\tHypothesis #{} & Reference #{}: '.format(hypothesis_id, reference_id))
                #         print('\t' + prepare_results(metric, results_per_ref['p'][reference_id],
                #                                      results_per_ref['r'][reference_id],
                #                                      results_per_ref['f'][reference_id]))
                # print()
            else:
                print(
                    prepare_results(metric, results['p'], results['r'],
                                    results['f']))
        print()
Example #24
    def setUp(self):
        self.hyp_path = 'predicted_sentences_task1_ref0_2.txt'
        self.ref_path = 'task1_ref0_2.txt'

        self.data_path = 'data2.json'
        with open(self.data_path) as f:
            self.data = json.load(f)

        self.rouge = rouge.Rouge()
        self.files_rouge = rouge.FilesRouge()
Example #25
 def rouge(self):
     """ Computes ROUGE
         Link: https://github.com/Diego999/py-rouge """
     evaluator = rouge_score.Rouge(metrics=["rouge-n"], max_n=1)
     rouge = {
         "ROUGE":
         evaluator.get_scores(self.hypothesis, self.target)["rouge-1"]["f"]
         * 100
     }
     self.metrics.update(rouge)
Example #26
    def setUp(self):
        self.hyp_path = './tests/hyp.txt'
        self.ref_path = './tests/ref.txt'

        self.data_path = './tests/data.json'
        with open(self.data_path) as f:
            self.data = json.load(f)

        self.rouge = rouge.Rouge()
        self.files_rouge = rouge.FilesRouge()
Example #27
    def on_epoch_end(self, epoch, logs=None):
        # Observe one or two examples during training
        print_sentence(s1, self.model)
        print_sentence(s2, self.model)

        if epoch >= 7:
            predict = []
            for t in test['description'].values:
                predict.append(gen_sent(t, model))

            pred = {i: p for i, p in zip(id, predict)}
            # print(pred)

            with open("submission" + str(epoch) + ".json",
                      "w",
                      encoding='utf-8') as f:
                json.dump(pred, f, indent=2, ensure_ascii=False)

        summarization = test_df['question'].values
        text = test_df['description'].values
        pred = []
        for t, s in tqdm(zip(text, summarization)):
            pred.append(gen_sent(t, self.model))
        print('epoch:' + str(epoch) + ' ,loss:' + str(logs['loss']))
        log.write('epoch:' + str(epoch) + ' ,loss:' + str(logs['loss']))

        rouge_1 = rouge.Rouge().get_scores(
            pred, summarization.tolist())[0]['rouge-1']['f']
        rouge_2 = rouge.Rouge().get_scores(
            pred, summarization.tolist())[0]['rouge-2']['f']
        rouge_l = rouge.Rouge().get_scores(
            pred, summarization.tolist())[0]['rouge-l']['f']
        print('rouge-1:', rouge_1)
        print('rouge-2:', rouge_2)
        print('rouge-l:', rouge_l)
        log.write('rouge-1:' + str(rouge_1))
        log.write('rouge-2:' + str(rouge_2))
        log.write('rouge-l:' + str(rouge_l))

        # Save the best model
        if logs['loss'] <= self.lowest:
            self.lowest = logs['loss']
            model.save_weights('best_model.weights')
Example #28
def evaluation_metrics(topic_name):
    reference = ''
    global final_summary
    address = os.getcwd() + '/GroundTruth/' + topic_name + '.1'
    file_open = open(address, 'r')
    for line in file_open:
        reference = reference + ''.join(line)
    hypothesis = final_summary
    rouge1 = rouge.Rouge()
    scores = rouge1.get_scores(reference, hypothesis)
    print(scores)
Example #29
 def _get_scores_python(hypothesis, references):
     """Note: py-rouge mixes up recall/precision"""
     return rouge.Rouge(
         metrics=["rouge-n", "rouge-l"],
         max_n=2,
         limit_length=False,
         apply_avg=True,
         alpha=0.5,  # Default F1_score
         stemming=True,
         ensure_compatibility=True,
     ).get_scores(hypothesis, references)
Example #30
def setup_rouge_python(metrics, N, stemming, apply_avg, apply_best, alpha,
                       limit_length, length_type, length_limit, weight_factor):
    return rouge.Rouge(metrics=metrics,
                       max_n=N,
                       limit_length=limit_length,
                       length_limit=length_limit,
                       length_limit_type=length_type,
                       apply_avg=apply_avg,
                       apply_best=apply_best,
                       alpha=alpha,
                       weight_factor=weight_factor,
                       stemming=stemming)
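A hypothetical call to the helper above; the argument values are illustrative and simply mirror the py-rouge configuration that recurs throughout these examples:

# Illustrative only; assumes setup_rouge_python from the snippet above is in scope.
evaluator = setup_rouge_python(metrics=["rouge-n", "rouge-l"], N=2,
                               stemming=True, apply_avg=True, apply_best=False,
                               alpha=0.5, limit_length=True, length_type="words",
                               length_limit=100, weight_factor=1.2)
scores = evaluator.get_scores(["the cat sat on the mat"],
                              ["a cat was sitting on the mat"])
print(scores["rouge-1"]["f"])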