import json
import logging
from glob import glob

import sacrebleu
from tqdm import tqdm


def main() -> None:
    args = parse_arguments(subtype="evaluate_shallow_metrics")

    # get verbosity
    if args.verbosity == 1:
        logger = logging.getLogger('base')
    else:
        logger = logging.getLogger('root')

    # define json glob and search space
    json_glob = args.json_glob
    files = glob(json_glob)

    for input_file in files:
        logger.info("Computing bleu and chrf scores: %s", input_file)

        # load single dictionary and compute surface similarity scores
        with open(input_file, "r") as f:
            store = json.load(f)

        for key in tqdm(store.keys()):
            source_orig_de = store[key]["sentence_original"]["source"]
            source_para_de = store[key]["sentence_paraphrase"]["source"]
            target_orig_en = store[key]["sentence_original"]["target"]
            target_para_en = store[key]["sentence_paraphrase"]["target"]

            # symmetrized sentence-level chrF: average of both scoring directions
            chrf_bar_source = (
                sacrebleu.sentence_chrf(source_orig_de, [source_para_de]).score +
                sacrebleu.sentence_chrf(source_para_de, [source_orig_de]).score) / 2
            chrf_bar_target = (
                sacrebleu.sentence_chrf(target_orig_en, [target_para_en]).score +
                sacrebleu.sentence_chrf(target_para_en, [target_orig_en]).score) / 2

            # symmetrized sentence-level BLEU
            bleu_bar_source = (
                sacrebleu.sentence_bleu(source_orig_de, [source_para_de]).score +
                sacrebleu.sentence_bleu(source_para_de, [source_orig_de]).score) / 2
            bleu_bar_target = (
                sacrebleu.sentence_bleu(target_orig_en, [target_para_en]).score +
                sacrebleu.sentence_bleu(target_para_en, [target_orig_en]).score) / 2

            store[key]["chrf_bar_source"] = chrf_bar_source
            store[key]["chrf_bar_target"] = chrf_bar_target
            store[key]["bleu_bar_source"] = bleu_bar_source
            store[key]["bleu_bar_target"] = bleu_bar_target
            store[key]["chrf_bar_mean"] = (chrf_bar_source + chrf_bar_target) / 2
            store[key]["bleu_bar_mean"] = (bleu_bar_source + bleu_bar_target) / 2

        # write enriched json back to disk (json.dump returns None, so do not reassign it)
        with open(input_file, "w") as f:
            json.dump(store, f, ensure_ascii=False)
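# For reference, a minimal sketch of one entry in the JSON files the script above
# consumes, inferred from the keys it reads; the identifier and sentence placeholders
# below are illustrative, not data from the original project.
example_store = {
    "some-id": {
        "sentence_original": {"source": "<German source sentence>",
                              "target": "<English target sentence>"},
        "sentence_paraphrase": {"source": "<German paraphrase>",
                                "target": "<English paraphrase>"},
    }
}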
def evaluate_example(self, summary, reference):
    score = sacrebleu.sentence_chrf(summary, reference,
                                    order=self.ncorder, beta=self.beta)
    score_dict = {"chrf": score.score}
    return score_dict
def evaluate_example(self, summary, reference):
    # sacrebleu expects a list of reference strings, so wrap a single reference
    if not isinstance(reference, list):
        reference = [reference]
    score = sacrebleu.sentence_chrf(summary, reference,
                                    order=self.ncorder, beta=self.beta)
    score_dict = {"chrf": score.score}
    return score_dict
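# A minimal usage sketch for the method above. The ChrfScorer class is hypothetical
# (only the ncorder/beta attribute names come from the snippets), the sentences are
# placeholders, and the order=/beta= keywords assume a sacrebleu 1.x-style API
# (in sacrebleu 2.x the n-gram order keyword is char_order instead).
import sacrebleu


class ChrfScorer:
    def __init__(self, ncorder: int = 6, beta: float = 2.0):
        self.ncorder = ncorder
        self.beta = beta

    def evaluate_example(self, summary, reference):
        if not isinstance(reference, list):
            reference = [reference]
        score = sacrebleu.sentence_chrf(summary, reference,
                                        order=self.ncorder, beta=self.beta)
        return {"chrf": score.score}


scorer = ChrfScorer()
print(scorer.evaluate_example("a summary sentence", "a reference sentence"))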
def run_sentence_chrf(candidates: list, references: list) -> list:
    """ Runs sentence chrF from Sacrebleu. """
    assert len(candidates) == len(references)
    chrf_scores = []
    for i in tqdm(range(len(candidates)), desc="Running chrF..."):
        chrf_scores.append(
            sentence_chrf(hypothesis=candidates[i],
                          reference=references[i]).score)
    return chrf_scores
def score_sentence(self, hyp, ref, lang=None):
    return sacrebleu.sentence_chrf(hyp, ref)
def _analyze(
        self, inputs_list: List[lit_types.JsonDict]) -> List[lit_types.JsonDict]:
    # SentencePiece-encode the source texts
    features_list: List[str] = list(
        map(lambda input: encode_sp(self.src_spp, input["src_text"]), inputs_list))

    infer_config: dict = self.config["infer"]
    dataset = make_inference_dataset(
        self.model,
        features_list,
        infer_config["batch_size"],
        length_bucket_width=infer_config["length_bucket_width"],
        prefetch_buffer_size=infer_config.get("prefetch_buffer_size"),
    )

    if self._analyze_fn is None:
        self._analyze_fn = tf.function(
            self.model.analyze, input_signature=(dataset.element_spec, ))
        if not tf.config.functions_run_eagerly():
            tf.get_logger().info("Tracing and optimizing the analyze graph...")
            self._analyze_fn.get_concrete_function()  # Trace the function now.

    results: List[lit_types.JsonDict] = [None] * len(features_list)
    for features in dataset:
        predictions = self._analyze_fn(features)

        # keep only the top-10 token probabilities instead of the full logits
        top_k_probs, top_k_ids = tf.nn.top_k(
            tf.nn.softmax(predictions["logits"]), k=10)
        del predictions["logits"]
        predictions["top_k_probs"] = top_k_probs
        predictions["top_k_ids"] = top_k_ids

        # mean-pool the encoder outputs over non-padding positions
        masks = tf.sequence_mask(features["length"],
                                 maxlen=tf.shape(features["ids"])[1])
        predictions["encoder_final_embedding"] = masked_token_mean(
            predictions["encoder_outputs"], masks)
        del predictions["encoder_outputs"]

        predictions = tf.nest.map_structure(lambda t: t.numpy(), predictions)
        for prediction in extract_batches(predictions):
            index: int = prediction["index"]
            target_length = prediction["length"]
            trg_tokens = prediction["tokens"][:target_length]
            tok_trg_text = self.model.labels_inputter.tokenizer.detokenize(trg_tokens)
            trg_text = decode_sp(tok_trg_text)
            attention = prediction["alignment"][:target_length]
            probs = prediction["top_k_probs"]
            ids = prediction["top_k_ids"]
            pred_tokens = list(self._convert_top_k(ids, probs, target_length))
            encoder_final_embedding = prediction["encoder_final_embedding"]

            # sentence-level TER on tokenized text and chrF3 on detokenized text
            ref_text = inputs_list[index]["ref_text"]
            tok_ref_text = encode_sp(self.trg_spp, ref_text)
            ter_score = sacrebleu.sentence_ter(tok_trg_text, [tok_ref_text])
            chrf_score = sacrebleu.sentence_chrf(trg_text, [ref_text], order=3)

            results[index] = {
                "trg_tokens": [t.decode("utf-8") for t in trg_tokens],
                "trg_text": trg_text,
                "attention": np.expand_dims(attention, axis=0),
                "src_tokens": features_list[index].split(),
                "pred_tokens": pred_tokens,
                "encoder_final_embedding": encoder_final_embedding,
                "ter": ter_score.score,
                "chrf3": chrf_score.score,
            }
    return results
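# masked_token_mean is referenced above but not shown; the sketch below is an
# assumption about what it does (mean-pool encoder outputs over non-padding
# positions), not the project's actual implementation.
import tensorflow as tf


def masked_token_mean(outputs: tf.Tensor, mask: tf.Tensor) -> tf.Tensor:
    """Average outputs of shape [batch, time, depth] over positions where mask is True."""
    mask = tf.cast(tf.expand_dims(mask, -1), outputs.dtype)  # [batch, time, 1]
    summed = tf.reduce_sum(outputs * mask, axis=1)           # [batch, depth]
    counts = tf.maximum(tf.reduce_sum(mask, axis=1), 1.0)    # avoid division by zero
    return summed / counts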
prev = [' '.join(o) for o in pred_sys_stream]

# choose one (gold or pred) and postprocess
sys_stream = pred_sys_stream
sys_stream = [
    pp.post_process(o, abstract[i], graph[i]) for i, o in enumerate(sys_stream)
]

# corpus-level scores
bleu = sacrebleu.corpus_bleu(sys_stream, ref_streams,
                             force=True, lowercase=True, tokenize='none').score
chrf = sacrebleu.corpus_chrf(sys_stream, ref_stream)

# sentence-level chrF, averaged; note that newer sacrebleu versions return score
# objects here, in which case `.score` would be needed before summing
all_sent_chrf = [
    sacrebleu.sentence_chrf(x, y) for x, y in zip(sys_stream, ref_stream)
]
avg_sent_chrf = sum(all_sent_chrf) / len(all_sent_chrf)

if args.output:
    with open(args.pred_file + '.final', 'w') as fo:
        for x in sys_stream:
            fo.write(x + '\n')
    with open(args.pred_file + '.ref', 'w') as fo:
        for x in ref_stream:
            fo.write(x + '\n')

print(avg_sent_chrf)
print(bleu, chrf)
def _get_sent_chrf(hypothesis: List[str],
                   references: List[List[str]],
                   extra_args: Optional[Dict[str, str]] = None):
    return [
        sb.sentence_chrf(h, r).score for h, r in zip(hypothesis, references[0])
    ]
def compute_chrf(references, translation):
    # join token lists back into strings; only the first reference is used
    hypo = ' '.join(translation)
    refs = ' '.join(references[0])
    return sacrebleu.sentence_chrf(hypo, refs).score
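# A minimal usage sketch for compute_chrf above: it expects pre-tokenized input,
# i.e. the translation as a list of tokens and the references as a list of token
# lists. The tokens below are illustrative placeholders. Passing a single reference
# string to sentence_chrf matches older sacrebleu versions; sacrebleu 2.x expects a
# list of reference strings instead.
translation = ["the", "cat", "sat", "on", "the", "mat"]
references = [["the", "cat", "is", "on", "the", "mat"]]
print(compute_chrf(references, translation))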
def chrf(self):
    """ ChRF of the hypothesis """
    # strip spaces and </s>, then map SentencePiece '▁' markers back to spaces
    # before scoring against the current reference
    detok_hyp = self.new_hyp.replace(' ', '').replace('</s>', '').replace('▁', ' ')
    return sacrebleu.sentence_chrf(detok_hyp, [self.cur_ref]).score
def test_chrf_sentence_level(hypothesis, references, expected_score):
    score = sacrebleu.sentence_chrf(hypothesis, references,
                                    eps_smoothing=True).score
    assert abs(score - expected_score) < EPSILON
def sentchrf(y_hat, y):
    # assumes an older sacrebleu API in which sentence_chrf returns a fraction
    # in [0, 1]; the result is rescaled to a 0-100 range
    return sentence_chrf(y_hat, y) * 100
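# For comparison, a sketch of the same helper against the sacrebleu 2.x API, where
# sentence_chrf takes a list of reference strings and .score is already on a
# 0-100 scale, so no rescaling is needed.
import sacrebleu


def sentchrf_v2(y_hat: str, y: str) -> float:
    return sacrebleu.sentence_chrf(y_hat, [y]).score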
# strip the prompt up to the '<generate>' marker, then lowercase and tokenize
if '<generate>' in hyp2[i]:
    hyp2[i] = hyp2[i].split('<generate>')[-1]
hyp2[i] = tokenize_sentence(hyp2[i].lower())

# Run evaluation: corpus-level BLEU and chrF++ for both hypothesis sets
print("BLEU hyp1", round(raw_corpus_bleu(hyp1, [ref]), 2))
print("chrF++ hyp1", round(raw_corpus_chrf(hyp1, ref).score * 100, 2))
print("BLEU hyp2", round(raw_corpus_bleu(hyp2, [ref]), 2))
print("chrF++ hyp2", round(raw_corpus_chrf(hyp2, ref).score * 100, 2))

# sentence-level BLEU significance test (scipy.stats.ttest_ind)
h1_sent_scores = list()
h2_sent_scores = list()
for r, h1, h2 in zip(ref, hyp1, hyp2):
    h1_sent_scores.append(sacrebleu.sentence_bleu(h1, r).score)
    h2_sent_scores.append(sacrebleu.sentence_bleu(h2, r).score)
t = ttest_ind(h1_sent_scores, h2_sent_scores)
print("BLEU P value:", "{:.20f}".format(t[1]))

# sentence-level chrF++ significance test
h1_sent_scores = list()
h2_sent_scores = list()
for r, h1, h2 in zip(ref, hyp1, hyp2):
    h1_sent_scores.append(sacrebleu.sentence_chrf(h1, r).score)
    h2_sent_scores.append(sacrebleu.sentence_chrf(h2, r).score)
t = ttest_ind(h1_sent_scores, h2_sent_scores)
print("ChrF++ P value:", "{:.20f}".format(t[1]))
dataframe = pd.DataFrame(columns=columns)
for i in range(len(source)):
    # sentence-level BLEU for the baseline MMT and the two contrastive systems
    BLEU_mmt_b = float("{:.3f}".format(
        sentence_bleu(base_MMT[i], [ref[i]], smooth_method='exp').score))
    BLEU_mmt_c_I = float("{:.3f}".format(
        sentence_bleu(mmt_c_I[i], [ref[i]], smooth_method='exp').score))
    BLEU_mmt_c_II = float("{:.3f}".format(
        sentence_bleu(mmt_c_II[i], [ref[i]], smooth_method='exp').score))
    diff_BLEU_I = float("{:.3f}".format(BLEU_mmt_c_I - BLEU_mmt_b))
    diff_BLEU_II = float("{:.3f}".format(BLEU_mmt_c_II - BLEU_mmt_b))
    diff_BLEU_I_II = float("{:.3f}".format(BLEU_mmt_c_II - BLEU_mmt_c_I))

    # sentence-level chrF3 (beta=3 for all three systems, matching the variable names)
    chrF3_mmt_b = float("{:.3f}".format(
        sentence_chrf(base_MMT[i], [ref[i]], beta=3).score))
    chrF3_mmt_c_I = float("{:.3f}".format(
        sentence_chrf(mmt_c_I[i], [ref[i]], beta=3).score))
    chrF3_mmt_c_II = float("{:.3f}".format(
        sentence_chrf(mmt_c_II[i], [ref[i]], beta=3).score))
    diff_chrF3_I = float("{:.3f}".format(chrF3_mmt_c_I - chrF3_mmt_b))
    diff_chrF3_II = float("{:.3f}".format(chrF3_mmt_c_II - chrF3_mmt_b))
    diff_chrF3_I_II = float("{:.3f}".format(chrF3_mmt_c_II - chrF3_mmt_c_I))

    # sentence-level hLEPOR
    hlepor_mmt_b = float("{:.3f}".format(single_hlepor_score(ref[i], base_MMT[i])))
    hlepor_mmt_c_I = float("{:.3f}".format(single_hlepor_score(ref[i], mmt_c_I[i])))
    hlepor_mmt_c_II = float("{:.3f}".format(single_hlepor_score(ref[i], mmt_c_II[i])))
    diff_hlepor_I = float("{:.3f}".format(hlepor_mmt_c_I - hlepor_mmt_b))
    diff_hlepor_II = float("{:.3f}".format(hlepor_mmt_c_II - hlepor_mmt_b))