Example 1
def main() -> None:
    args = parse_arguments(subtype="evaluate_shallow_metrics")
    # get verbosity
    if args.verbosity == 1:
        logger = logging.getLogger('base')
    else:
        logger = logging.getLogger('root')
    # define json glob
    json_glob = args.json_glob
    # define search space
    files = glob(json_glob)
    for input_file in files:
        # log information
        logger.info("Computing bleu and chrf scores: %s", input_file)
        # load single dictionary and compute surface similarity scores
        with open(input_file, "r") as f:
            store = json.load(f)
        for key in tqdm(store.keys()):
            source_orig_de = store[key]["sentence_original"]["source"]
            source_para_de = store[key]["sentence_paraphrase"]["source"]
            target_orig_en = store[key]["sentence_original"]["target"]
            target_para_en = store[key]["sentence_paraphrase"]["target"]
            chrf_bar_source = (sacrebleu.sentence_chrf(
                source_orig_de, [source_para_de]).score +
                               sacrebleu.sentence_chrf(
                                   source_para_de, [source_orig_de]).score) / 2
            chrf_bar_target = (sacrebleu.sentence_chrf(
                target_orig_en, [target_para_en]).score +
                               sacrebleu.sentence_chrf(
                                   target_para_en, [target_orig_en]).score) / 2
            bleu_bar_source = (sacrebleu.sentence_bleu(
                source_orig_de, [source_para_de]).score +
                               sacrebleu.sentence_bleu(
                                   source_para_de, [source_orig_de]).score) / 2
            bleu_bar_target = (sacrebleu.sentence_bleu(
                target_orig_en, [target_para_en]).score +
                               sacrebleu.sentence_bleu(
                                   target_para_en, [target_orig_en]).score) / 2
            store[key]["chrf_bar_source"] = chrf_bar_source
            store[key]["chrf_bar_target"] = chrf_bar_target
            store[key]["bleu_bar_source"] = bleu_bar_source
            store[key]["bleu_bar_target"] = bleu_bar_target
            store[key]["chrf_bar_mean"] = (chrf_bar_source +
                                           chrf_bar_target) / 2
            store[key]["bleu_bar_mean"] = (bleu_bar_source +
                                           bleu_bar_target) / 2
        # write back json to disk
        with open(input_file, "w") as f:
            json.dump(store, f, ensure_ascii=False)
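Example 1 scores each pair in both directions and averages, since chrF and BLEU are not symmetric in hypothesis and reference. A minimal sketch of that idea as a standalone helper (the function name and the sacrebleu 2.x list-wrapped reference are my own):

import sacrebleu

def symmetric_chrf(a: str, b: str) -> float:
    """Average chrF over both scoring directions; chrF(a|b) != chrF(b|a)."""
    forward = sacrebleu.sentence_chrf(a, [b]).score
    backward = sacrebleu.sentence_chrf(b, [a]).score
    return (forward + backward) / 2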
Example 2
def evaluate_example(self, summary, reference):
    score = sacrebleu.sentence_chrf(summary,
                                    reference,
                                    order=self.ncorder,
                                    beta=self.beta)
    score_dict = {"chrf": score.score}
    return score_dict
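Note that the keyword `order=` in Example 2 follows the older sacrebleu 1.x API; in sacrebleu 2.x the character n-gram order is spelled `char_order=` and the reference argument must be a list of strings. A minimal 2.x-style call (the strings are made up):

import sacrebleu

score = sacrebleu.sentence_chrf(
    "the cat sat on the mat",    # hypothesis
    ["the cat is on the mat"],   # list of reference strings
    char_order=6,                # 2.x default character n-gram order
    beta=2,                      # recall weight; beta=2 gives chrF2
)
print(score.score)  # 0-100 scale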
Example 3
def evaluate_example(self, summary, reference):
    if not isinstance(reference, list):
        reference = [reference]
    score = sacrebleu.sentence_chrf(summary,
                                    reference,
                                    order=self.ncorder,
                                    beta=self.beta)
    score_dict = {"chrf": score.score}
    return score_dict
Example 4
def run_sentence_chrf(candidates: list, references: list) -> list:
    """ Runs sentence chrF from Sacrebleu. """
    assert len(candidates) == len(references)
    chrf_scores = []
    for i in tqdm(range(len(candidates)), desc="Running chrF..."):
        chrf_scores.append(
            sentence_chrf(hypothesis=candidates[i],
                          reference=references[i]).score)
    return chrf_scores
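A hypothetical call to run_sentence_chrf, just to show the expected shapes (two parallel lists, one reference per candidate):

candidates = ["the cat sat on the mat", "hello world"]
references = ["the cat is on the mat", "hello world !"]
scores = run_sentence_chrf(candidates, references)
print(sum(scores) / len(scores))  # mean sentence-level chrF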
Example 5
def score_sentence(self, hyp, ref, lang=None):
    return sacrebleu.sentence_chrf(hyp, ref)
Example 6
    def _analyze(
            self,
            inputs_list: List[lit_types.JsonDict]) -> List[lit_types.JsonDict]:
        features_list: List[str] = list(
            map(lambda input: encode_sp(self.src_spp, input["src_text"]),
                inputs_list))
        infer_config: dict = self.config["infer"]
        dataset = make_inference_dataset(
            self.model,
            features_list,
            infer_config["batch_size"],
            length_bucket_width=infer_config["length_bucket_width"],
            prefetch_buffer_size=infer_config.get("prefetch_buffer_size"),
        )

        if self._analyze_fn is None:
            self._analyze_fn = tf.function(
                self.model.analyze, input_signature=(dataset.element_spec, ))
            if not tf.config.functions_run_eagerly():
                tf.get_logger().info(
                    "Tracing and optimizing the analyze graph...")
                self._analyze_fn.get_concrete_function(
                )  # Trace the function now.

        results: List[lit_types.JsonDict] = [None] * len(features_list)
        for features in dataset:
            predictions = self._analyze_fn(features)

            top_k_probs, top_k_ids = tf.nn.top_k(tf.nn.softmax(
                predictions["logits"]),
                                                 k=10)
            del predictions["logits"]
            predictions["top_k_probs"] = top_k_probs
            predictions["top_k_ids"] = top_k_ids

            masks = tf.sequence_mask(features["length"],
                                     maxlen=tf.shape(features["ids"])[1])
            predictions["encoder_final_embedding"] = masked_token_mean(
                predictions["encoder_outputs"], masks)
            del predictions["encoder_outputs"]

            predictions = tf.nest.map_structure(lambda t: t.numpy(),
                                                predictions)
            for prediction in extract_batches(predictions):
                index: int = prediction["index"]
                target_length = prediction["length"]
                trg_tokens = prediction["tokens"][:target_length]
                tok_trg_text = self.model.labels_inputter.tokenizer.detokenize(
                    trg_tokens)
                trg_text = decode_sp(tok_trg_text)
                attention = prediction["alignment"][:target_length]
                probs = prediction["top_k_probs"]
                ids = prediction["top_k_ids"]
                pred_tokens = list(
                    self._convert_top_k(ids, probs, target_length))
                encoder_final_embedding = prediction["encoder_final_embedding"]
                ref_text = inputs_list[index]["ref_text"]
                tok_ref_text = encode_sp(self.trg_spp, ref_text)
                ter_score = sacrebleu.sentence_ter(tok_trg_text,
                                                   [tok_ref_text])
                chrf_score = sacrebleu.sentence_chrf(trg_text, [ref_text],
                                                     order=3)
                results[index] = {
                    "trg_tokens": [t.decode("utf-8") for t in trg_tokens],
                    "trg_text": trg_text,
                    "attention": np.expand_dims(attention, axis=0),
                    "src_tokens": features_list[index].split(),
                    "pred_tokens": pred_tokens,
                    "encoder_final_embedding": encoder_final_embedding,
                    "ter": ter_score.score,
                    "chrf3": chrf_score.score,
                }
        return results
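masked_token_mean is defined outside this snippet; from its use here it presumably averages the encoder outputs over the non-padding positions given by the mask. A minimal sketch of that operation under this assumption:

import tensorflow as tf

def masked_token_mean_sketch(outputs: tf.Tensor, masks: tf.Tensor) -> tf.Tensor:
    # outputs: [batch, time, dim]; masks: [batch, time] boolean
    weights = tf.cast(masks, outputs.dtype)
    summed = tf.reduce_sum(outputs * weights[:, :, tf.newaxis], axis=1)
    counts = tf.reduce_sum(weights, axis=1, keepdims=True)
    return summed / tf.maximum(counts, 1.0)  # guard against empty masks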
Example 7
    prev = [' '.join(o) for o in pred_sys_stream]

    # choose one (gold or pred) and postprocess
    sys_stream = pred_sys_stream
    sys_stream = [
        pp.post_process(o, abstract[i], graph[i])
        for i, o in enumerate(sys_stream)
    ]

    bleu = sacrebleu.corpus_bleu(sys_stream,
                                 ref_streams,
                                 force=True,
                                 lowercase=True,
                                 tokenize='none').score
    chrf = sacrebleu.corpus_chrf(sys_stream, ref_stream)
    all_sent_chrf = [
        sacrebleu.sentence_chrf(x, y).score
        for x, y in zip(sys_stream, ref_stream)
    ]
    avg_sent_chrf = sum(all_sent_chrf) / len(all_sent_chrf)
    if args.output:
        with open(args.pred_file + '.final', 'w') as fo:
            for x in sys_stream:
                fo.write(x + '\n')

        with open(args.pred_file + '.ref', 'w') as fo:
            for x in ref_stream:
                fo.write(x + '\n')
    print(avg_sent_chrf)
    print(bleu, chrf)
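Note that corpus_chrf aggregates character n-gram statistics over the whole test set, so it is generally not equal to the average of the per-sentence scores computed above; the snippet prints both, which makes the distinction visible. A self-contained illustration (hypothetical data):

import sacrebleu

hyps = ["the cat sat", "a dog barks"]
refs = ["the cat sat down", "the dog barked"]
corpus = sacrebleu.corpus_chrf(hyps, [refs]).score
mean_sent = sum(sacrebleu.sentence_chrf(h, [r]).score
                for h, r in zip(hyps, refs)) / len(hyps)
print(corpus, mean_sent)  # usually close, but not identical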
Example 8
def _get_sent_chrf(hypothesis: List[str],
                   references: List[List[str]],
                   extra_args: Optional[Dict[str, str]] = None):
    return [
        sb.sentence_chrf(h, [r]).score
        for h, r in zip(hypothesis, references[0])
    ]
Example 9
def compute_chrf(references, translation):
    hypo = ' '.join(translation)
    refs = [' '.join(r) for r in references]
    # score against the first reference, passed as the list sentence_chrf expects
    return sacrebleu.sentence_chrf(hypo, refs[:1]).score
Example 10
def chrf(self):
    """
    chrF of the hypothesis
    """
    # undo SentencePiece tokenization before scoring
    detok_hyp = self.new_hyp.replace(' ', '').replace('</s>', '').replace('▁', ' ')
    return sacrebleu.sentence_chrf(detok_hyp, [self.cur_ref]).score
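For context on the replace chain above: SentencePiece marks word boundaries with '▁', so deleting the spaces between pieces and then mapping '▁' back to a space recovers the plain-text hypothesis. A tiny hypothetical check:

sp_pieces = "▁the ▁cat ▁sat"
plain = sp_pieces.replace(' ', '').replace('▁', ' ').strip()
assert plain == "the cat sat"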
Example 11
def test_chrf_sentence_level(hypothesis, references, expected_score):
    score = sacrebleu.sentence_chrf(hypothesis, references,
                                    eps_smoothing=True).score
    assert abs(score - expected_score) < EPSILON
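eps_smoothing=True selects the epsilon smoothing of the original chrF++.py (and NLTK/Moses) implementations instead of sacrebleu's default effective-order smoothing, so the two settings can give slightly different scores on short segments. A quick way to compare them (the strings are made up):

import sacrebleu

hyp = "the cat sat"
refs = ["the big cat sat down"]
print(sacrebleu.sentence_chrf(hyp, refs).score)
print(sacrebleu.sentence_chrf(hyp, refs, eps_smoothing=True).score)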
Example 12
def sentchrf(y_hat, y):
    # assumes an older sacrebleu where sentence_chrf returns a raw float in
    # [0, 1]; recent versions return a score object whose .score attribute
    # is already on a 0-100 scale, which would make the * 100 redundant
    return sentence_chrf(y_hat, y) * 100
Example 13
        if '<generate>' in hyp2[i]:
            hyp2[i] = hyp2[i].split('<generate>')[-1]
        hyp2[i] = tokenize_sentence(hyp2[i].lower())

    # Run evaluation
    print("BLEU hyp1", round(raw_corpus_bleu(hyp1, [ref]), 2))
    print("chrF++ hyp1", round(raw_corpus_chrf(hyp1, ref).score * 100, 2))

    print("BLEU hyp2", round(raw_corpus_bleu(hyp2, [ref]), 2))
    print("chrF++ hyp2", round(raw_corpus_chrf(hyp2, ref).score * 100, 2))

    h1_sent_scores = list()
    h2_sent_scores = list()

    for r, h1, h2 in zip(ref, hyp1, hyp2):
        h1_sent_scores.append(sacrebleu.sentence_bleu(h1, [r]).score)
        h2_sent_scores.append(sacrebleu.sentence_bleu(h2, [r]).score)

    t = ttest_ind(h1_sent_scores, h2_sent_scores)
    print("BLEU P value:", "{:.20f}".format(t[1]))

    h1_sent_scores = list()
    h2_sent_scores = list()

    for r, h1, h2 in zip(ref, hyp1, hyp2):
        h1_sent_scores.append(sacrebleu.sentence_chrf(h1, [r]).score)
        h2_sent_scores.append(sacrebleu.sentence_chrf(h2, [r]).score)

    t = ttest_ind(h1_sent_scores, h2_sent_scores)
    print("ChrF++ P value:", "{:.20f}".format(t[1]))
Example 14
dataframe = pd.DataFrame(columns=columns)
for i in range(len(source)):
    BLEU_mmt_b = float("{:.3f}".format(
        sentence_bleu(base_MMT[i], [ref[i]], smooth_method='exp').score))
    BLEU_mmt_c_I = float("{:.3f}".format(
        sentence_bleu(mmt_c_I[i], [ref[i]], smooth_method='exp').score))
    BLEU_mmt_c_II = float("{:.3f}".format(
        sentence_bleu(mmt_c_II[i], [ref[i]], smooth_method='exp').score))
    diff_BLEU_I = float(
        "{:.3f}".format(float(BLEU_mmt_c_I) - float(BLEU_mmt_b)))
    diff_BLEU_II = float(
        "{:.3f}".format(float(BLEU_mmt_c_II) - float(BLEU_mmt_b)))
    diff_BLEU_I_II = float(
        "{:.3f}".format(float(BLEU_mmt_c_II) - float(BLEU_mmt_c_I)))
    chrF3_mmt_b = float("{:.3f}".format(
        sentence_chrf(base_MMT[i], [ref[i]], beta=3).score))
    chrF3_mmt_c_I = float("{:.3f}".format(
        sentence_chrf(mmt_c_I[i], [ref[i]], beta=3).score))
    chrF3_mmt_c_II = float("{:.3f}".format(
        sentence_chrf(mmt_c_II[i], [ref[i]], beta=3).score))
    diff_chrF3_I = float("{:.3f}".format(chrF3_mmt_c_I - chrF3_mmt_b))
    diff_chrF3_II = float("{:.3f}".format(chrF3_mmt_c_II - chrF3_mmt_b))
    diff_chrF3_I_II = float("{:.3f}".format(chrF3_mmt_c_II - chrF3_mmt_c_I))
    hlepor_mmt_b = float("{:.3f}".format(
        single_hlepor_score(ref[i], base_MMT[i])))
    hlepor_mmt_c_I = float("{:.3f}".format(
        single_hlepor_score(ref[i], mmt_c_I[i])))
    hlepor_mmt_c_II = float("{:.3f}".format(
        single_hlepor_score(ref[i], mmt_c_II[i])))
    diff_hlepor_I = float("{:.3f}".format(hlepor_mmt_c_I - hlepor_mmt_b))
    diff_hlepor_II = float("{:.3f}".format(hlepor_mmt_c_II - hlepor_mmt_b))
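The snippet ends before the computed values are stored; presumably each iteration appends one row to dataframe. A hedged sketch of that step (the column names below are assumptions; the real ones live in the columns list defined outside this snippet):

row = {
    "BLEU_base": BLEU_mmt_b,
    "diff_BLEU_I": diff_BLEU_I,
    "chrF3_base": chrF3_mmt_b,
    "hlepor_base": hlepor_mmt_b,
}
dataframe = pd.concat([dataframe, pd.DataFrame([row])], ignore_index=True)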