def eval_measure(gold, sys, eval_type='acc'):
    ''' Evaluation measure

    This takes in gold labels and system outputs and evaluates their accuracy.
    It currently supports:
    * Accuracy (acc), percentage of labels that match
    * Pearson's correlation coefficient (pearson)
    * BLEU score (bleu)
    * BLEU_detok, on detokenized references and translations, with internal tokenization

    :param gold: the correct labels
    :param sys: the system outputs
    :param eval_type: The type of evaluation to do (acc, pearson, bleu, bleu_detok)
    '''
    if eval_type == EVAL_TYPE_ACC:
        return sum([1 if g == s else 0 for g, s in zip(gold, sys)]) / float(len(gold))
    elif eval_type == EVAL_TYPE_BLEU:
        import nltk
        gold_wrap = [[x] for x in gold]
        return nltk.translate.bleu_score.corpus_bleu(gold_wrap, sys)
    elif eval_type == EVAL_TYPE_PEARSON:
        return np.corrcoef([gold, sys])[0, 1]
    elif eval_type == EVAL_TYPE_BLEU_DETOK:
        import sacrebleu
        # make sure score is 0-based instead of 100-based
        return sacrebleu.corpus_bleu(sys, [gold]).score / 100.
    else:
        raise NotImplementedError('Unknown eval type in eval_measure: %s' % eval_type)
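# A minimal usage sketch for eval_measure above (hypothetical data), assuming the
# EVAL_TYPE_* constants are defined elsewhere in the module as the plain strings
# 'acc', 'pearson', 'bleu' and 'bleu_detok'.
gold_labels = ['pos', 'neg', 'neg', 'pos']
sys_labels = ['pos', 'neg', 'pos', 'pos']
print(eval_measure(gold_labels, sys_labels, eval_type='acc'))  # 0.75: 3 of 4 labels match

gold_sents = ['the cat sat on the mat', 'a dog barked']
sys_sents = ['the cat sat on a mat', 'a dog barked']
print(eval_measure(gold_sents, sys_sents, eval_type='bleu_detok'))  # sacreBLEU score scaled to 0-1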
def score_batch(self,
                hypotheses: List[List[str]],
                references: List[List[str]]) -> float:
    hyp_joined = [" ".join(hyp) for hyp in hypotheses]
    ref_joined = [" ".join(ref) for ref in references]
    bleu = corpus_bleu(hyp_joined, [ref_joined],
                       smooth_method=self.smooth_method,
                       smooth_value=self.smooth_value,
                       force=self.force,
                       lowercase=self.lowercase,
                       tokenize=self.tokenize,
                       use_effective_order=self.use_effective_order)
    return bleu.score
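# A standalone sketch of the same pattern as score_batch above (hypothetical data):
# sacreBLEU works on whole sentence strings, so the token lists are joined with
# spaces before calling corpus_bleu(hypotheses, [references]).
from sacrebleu import corpus_bleu

hyps = [["the", "cat", "sat", "on", "the", "mat"], ["a", "big", "dog", "barked", "loudly"]]
refs = [["the", "cat", "sat", "on", "a", "mat"], ["a", "big", "dog", "barked", "loudly"]]
score = corpus_bleu([" ".join(h) for h in hyps],
                    [[" ".join(r) for r in refs]]).score
print(score)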
def bleu(targets, predictions):
    predictions = [tf.compat.as_text(x) for x in predictions]

    if isinstance(targets[0], list):
        targets = [[tf.compat.as_text(x) for x in target] for target in targets]
    else:
        targets = [tf.compat.as_text(x) for x in targets]
        targets = [targets]

    bleu_score = corpus_bleu(predictions, targets,
                             smooth_method="exp",
                             smooth_value=0.0,
                             force=False,
                             lowercase=False,
                             tokenize="ja",
                             use_effective_order=False)
    return {"bleu": bleu_score.score}
def get_score_by_task(task):
    results = {}
    for lg in task2lg[task].split():
        preds = []
        labels = []
        guid = 0
        guids = []
        if task == "MLQA":
            datasets, preds = load_mlqa_data(task, lg)
        elif task in ["NC", "QADSM", "QAM", "WPR"]:
            preds, labels, guids = load_data(task, lg)
        elif task in ["QG", "NTG"]:
            preds, labels = load_qg_ntg_data(task, lg)
        elif task in ["NER", "POS"]:
            preds, labels = load_ner_pos_data(task, lg)
        elif task == "XNLI":
            labels, preds = load_xnli_data(task, lg)
        elif task == "PAWSX":
            labels, preds = load_pawsx_data(task, lg)

        if task == "MLQA":
            results[lg] = mlqa_evaluate(datasets["data"], preds, lg)["f1"] / 100  # Normalize
        elif task == "NER":
            results[lg] = f1_score(labels, preds)
        elif task == "POS":
            results[lg] = precision_score(labels, preds)
        elif task in ["NC", "XNLI", "PAWSX", "QADSM", "QAM"]:
            results[lg] = simple_accuracy(preds, labels)
        elif task == "WPR":
            results[lg] = simple_ndcg(preds, labels, guids)
        elif task == "QG" or task == "NTG":
            results[lg] = sacrebleu.corpus_bleu(preds, [labels], lowercase=True).score / 100  # Normalize

    avg = 0
    count = 0
    for key in results.keys():
        avg += results[key]
        count += 1
    avg /= count
    results["avg"] = avg
    return results
def fetch_translations_and_bleu(model: nn.Module,
                                dataset: DataLoader,
                                tokenizer: Tokenizer,
                                iterations=10,
                                k=1,
                                num_samples=6000):
    """ BLEU keeping number of samples in training and validation same """

    model.eval()
    device = torch.device("cpu")
    if torch.cuda.is_available():
        device = torch.device("cuda")
        model.cuda()

    pred = []
    tgt = []
    src = []
    for batch in tqdm(dataset, desc="predicting ... ", leave=False):
        # use a distinct loop variable so it does not shadow the top-k argument `k`
        for key in batch:
            batch[key] = batch[key].to(device)
        out = model.generate(**batch, iterations=iterations, tokenizer=tokenizer, k=k)
        pred.extend(out["tgt_text"])
        src.extend(tokenizer.batch_decode(batch["input_ids"], is_src_txt=True))
        tgt.extend(tokenizer.batch_decode(batch["labels"], is_tgt_txt=True))
        if len(pred) > num_samples:
            break

    # bleu score
    bleu = corpus_bleu(pred, [tgt]).score

    return {
        "bleu": bleu,
        "src": src,
        "tgt": tgt,
        "pred": pred
    }
def cal_bleu(samples, task, args):
    tokenizer = encoders.build_tokenizer(args)
    bpe = encoders.build_bpe(args)

    def decode_fn(x):
        return (x + ' ').replace('@@ ', '').rstrip()

    tgt_dict = task.target_dictionary
    target_tensor = samples['target']
    cand_tensor = samples['cand']
    batch_bleu = []
    assert len(target_tensor) == len(cand_tensor)
    for i in range(len(target_tensor)):
        tgt_tokens = utils.strip_pad(target_tensor[i], tgt_dict.pad()).int().cpu()
        cand_tokens = utils.strip_pad(cand_tensor[i], tgt_dict.pad()).int().cpu()
        tgt_str = tgt_dict.string(tgt_tokens, None, escape_unk=True,
                                  extra_symbols_to_ignore={tgt_dict.eos()})
        tgt_str = decode_fn(tgt_str)
        cand_str = tgt_dict.string(cand_tokens, None, escape_unk=True,
                                   extra_symbols_to_ignore={tgt_dict.eos()})
        cand_str = decode_fn(cand_str)
        bleuscore = sacrebleu.corpus_bleu([cand_str], [[tgt_str]], use_effective_order=True)
        batch_bleu.append(bleuscore.score)
    samples['bleu'] = batch_bleu
    assert len(batch_bleu) == len(target_tensor)
    return samples
def _compute_score(self):
    """Computes sacreBLEU score for current submission."""
    sgml_path = str(self.sgml_file.name)
    text_path = sgml_path.replace('.sgm', '.txt')
    ref_path = Path(self.test_set.reference_file.name)

    from sacrebleu import process_to_text, corpus_bleu

    # Extract raw text from SGML file
    if not Path(text_path).exists():
        process_to_text(sgml_path, text_path)

    hyp_stream = [x for x in open(text_path, encoding='utf-8')]
    ref_stream = [r for r in open(ref_path, encoding='utf-8')]

    bleu = corpus_bleu(hyp_stream, [ref_stream])

    self.score = bleu.score
    self.save()
def bleu_score(self):
    hypotheses = []
    for prompt in self.pred:
        d = self.pred[prompt]
        pred_lst = sorted(d.items(), key=lambda item: item[1], reverse=True)
        hypotheses.append(pred_lst[0][0])

    references = []
    for prompt in self.gold:
        d = self.gold[prompt]
        ref_lst = sorted(d.items(), key=lambda item: item[1], reverse=True)
        ref_lst = [item[0] for item in ref_lst]
        references.append(ref_lst)

    references_t = [
        list(ref_t) for ref_t in zip_longest(*references, fillvalue='')
    ]
    bleu = sacrebleu.corpus_bleu(hypotheses, references_t)
    return bleu.score
def sacre_bleu(self, hypo, groundtruth=None, lc=False):
    if groundtruth is None:
        ref = self._refs_for_sacre
    else:
        if isinstance(groundtruth[0], str):
            ref = [groundtruth]
        else:
            ref = groundtruth
    try:
        bleu = sacrebleu.corpus_bleu(hypo, ref, lowercase=lc,
                                     tokenize=self._sacre_tokenize_str)
        return bleu.score
    except IndexError:
        logging.info("Found empty lines.")
        print(traceback.format_exc())
        return 0.
    except ZeroDivisionError:
        logging.info("Empty reference")
        print(traceback.format_exc())
        return 0.
def bleu(corpus, truths):
    '''
    corpus: list, NBs * BATCHSIZE * MAX_LEN
    truths: list, NBs * BATCHSIZE * MAX_LEN
    return: array of length NBs, avg bleu score for each batch
    '''
    n = len(corpus)
    bleus = [0] * n
    for i in range(n):
        pred, true = corpus[i], truths[i]
        sumbleu = 0.0
        for j in range(len(corpus[i])):
            pred_tensor, true_tensor = pred[j], true[j]
            pred_sent, true_sent = convert_idx_2_sent(pred_tensor, true_tensor, target_tra)
            # sacrebleu expects the hypothesis first and the reference second
            sumbleu += corpus_bleu(pred_sent, true_sent).score
        avgbleu = sumbleu / len(corpus[i])
        bleus[i] = avgbleu
    return bleus
def _compute_score(self):
    """Computes sacreBLEU score for current submission."""
    sgml_path = str(self.sgml_file.name)
    text_path = sgml_path.replace('.sgm', '.txt')
    ref_path = 'testsets/wmt18.ende.ref.txt'

    from sacrebleu import process_to_text, corpus_bleu
    from pathlib import Path

    if not Path(text_path).exists():
        process_to_text(sgml_path, text_path)

    hyp_stream = [x for x in open(text_path, encoding='utf-8')]
    ref_stream = [r for r in open(ref_path, encoding='utf-8')]

    bleu = corpus_bleu(hyp_stream, [ref_stream])

    self.score = bleu.score
    self.save()
def eval_moses_bleu(ref, hyp):
    """
    Given a hypothesis file and a reference file (one sentence per line),
    compute the BLEU score with sacreBLEU using tokenize='none', i.e. scoring
    the text exactly as given, in the spirit of the Moses multi-bleu script.
    """
    assert os.path.isfile(hyp)
    assert os.path.isfile(ref) or os.path.isfile(ref + '0')
    hyps, refs = [], []
    with open(hyp) as fh, open(ref) as rh:
        for line in fh:
            hyps.append(line.strip())
        for line in rh:
            refs.append(line.strip())
    score = sacrebleu.corpus_bleu(hyps, [refs], tokenize='none').score
    return score
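# A self-contained sketch exercising eval_moses_bleu above: write a tiny
# hypothesis/reference pair to temporary files (hypothetical data), then score
# them. Assumes `os` and `sacrebleu` are imported at module level, as the
# function itself requires.
import tempfile

_tmp_dir = tempfile.mkdtemp()
_hyp_path = os.path.join(_tmp_dir, "hyp.txt")
_ref_path = os.path.join(_tmp_dir, "ref.txt")
with open(_hyp_path, "w") as f:
    f.write("the cat sat on a mat\na dog barked\n")
with open(_ref_path, "w") as f:
    f.write("the cat sat on the mat\na dog barked\n")
print(eval_moses_bleu(_ref_path, _hyp_path))  # corpus BLEU over the two lines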
def bs_test(encoder, decoder, data_loader, beam_k, max_length, train_input_lang, train_output_lang):
    count = 0
    candidate_corpus = []
    reference_corpus = []
    for i, (input, input_len, target, target_len) in enumerate(data_loader):
        decoded_words = bs_evaluate(encoder, decoder, input, beam_k, max_length)

        candidate_sentences = []
        for ind in range(decoded_words.shape[0]):
            sent_words = []
            for token in decoded_words[ind]:
                if token != PAD_token and token != EOS_token:
                    sent_words.append(train_output_lang.index2word[token])
                else:
                    break
            sent_words = ' '.join(sent_words)
            # print the first sentence in the first batch to peek the translation result
            if count == 0:
                print('predict: ' + sent_words)
                count += 1
            candidate_sentences.append(sent_words)
        candidate_corpus.extend(candidate_sentences)

        reference_sentences = []
        for sent in target:
            sent_words = []
            for token in sent:
                if token.item() != EOS_token:
                    sent_words.append(train_output_lang.index2word[token.item()])
                else:
                    break
            sent_words = ' '.join(sent_words)
            if count == 1:
                print('target: ' + sent_words)
                count += 1
            reference_sentences.append(sent_words)
        reference_corpus.extend(reference_sentences)

    # `smooth` and `smooth_floor` are the argument names used by older sacrebleu releases
    score = corpus_bleu(candidate_corpus, [reference_corpus],
                        smooth='exp', smooth_floor=0.0, force=False).score
    return score
def evaluatebleu(encoder1, decoder1, loader, with_o=True):
    score = 0
    output_words = []
    true_words = []
    for i, (candidate, length_1, reference, length_2) in enumerate(loader):
        # print(i)
        if with_o == True:
            max_length = max(length_2).item()
            output_words += evaluate(encoder1, decoder1, candidate, length_1,
                                     reference, length_2, max_length)
        else:
            output_words, attentions = evaluate(encoder, decoder, pair[0])
        true_words += mapback(reference)
    score = sacrebleu.corpus_bleu(output_words, [true_words])
    print(output_words[0])
    print(true_words[0])
    return (score, output_words, true_words)
def write_evals(writer, experiment, translation, file_path, ref, src):
    writer = SummaryWriter("runs/{}-{}".format(experiment, translation))
    steps = int(translation)
    output_path = "translations/{}/{}".format(experiment, translation)
    with open(output_path, "r", encoding="utf-8") as infile:
        system_output = [x.strip() for x in infile.readlines()]

    bleu = sacrebleu.corpus_bleu(system_output, [ref])
    chrf = sacrebleu.corpus_chrf(system_output, [ref])
    rhyme_score, copied, reconstructed = concurrent_score(system_output, languages[experiment], ref, src)
    print(experiment, translation, bleu.score, rhyme_score, copied, reconstructed)

    wall = os.stat(file_path).st_mtime
    writer.add_scalar(experiment + "/CHRF", chrf.score, global_step=steps, walltime=wall)
    writer.add_scalar(experiment + "/BLEU", bleu.score, global_step=steps, walltime=wall)
    writer.add_scalar(experiment + "/Rhyme", rhyme_score, global_step=steps, walltime=wall)
    writer.add_scalar(experiment + "/Copied", copied, global_step=steps, walltime=wall)
    writer.add_scalar(experiment + "/Reconstructed", reconstructed, global_step=steps, walltime=wall)
    writer.flush()
def evaluate(self, predicts, answers):
    """
    import sacrebleu
    refs = [['The dog bit the man.', 'It was not unexpected.', 'The man bit him first.'],
            ['The dog had bit the man.', 'No one was surprised.', 'The man had bitten the dog.']]
    sys = ['The dog bit the man.', "It wasn't surprising.", 'The man had just bitten him.']
    bleu = sacrebleu.corpus_bleu(sys, refs)
    bleu.score
    48.530827009929865
    """
    try:
        bleu = sacrebleu.corpus_bleu(predicts, answers, lowercase=True)
    except EOFError:
        print('# preds', len(predicts))
        print('# tgts', len(answers))
        exit()
    return {'BLEU': bleu.score}
def validation_epoch_end(self, outputs) -> None:
    unpad = self.trainer.datamodule.unpad
    bpe = yttm.BPE(self.bpe_file)

    loss = torch.stack([o[0] for o in outputs]).mean()
    acc = self.val_acc.compute().item()
    self.val_acc.reset()

    x = bpe.decode([l for o in outputs for l in unpad(o[1])])
    hyp = bpe.decode([l for o in outputs for l in unpad(o[2])])
    y_true = bpe.decode([l for o in outputs for l in unpad(o[3])])
    bleu = sacrebleu.corpus_bleu(hyp, [y_true]).score

    self.log('val/loss', loss, True)
    self.log('val/acc', acc, True)
    self.log('val/bleu', bleu, True)
    nni.report_intermediate_result({'bleu': bleu})

    # save files
    self.save_file(f'val.{self.src_lang}', x)
    self.save_file(f'val.{self.trg_lang}', y_true)
    self.save_file(f'val.{self.trg_lang}.{self.global_step}.hyp', hyp)
def compute_metrics(ref, hyp, hyp_order):
    refs = []
    hyps = []
    for id in hyp_order:
        for segment in hyp[id]:
            hyps.append(segment)
        try:
            for segment in ref[id]:
                refs.append(segment)
        except KeyError:
            sys.stderr.write('Error: there are no references for document'
                             + ' "' + id + '"\n')
            sys.exit(-1)
    try:
        bleu = sacrebleu.corpus_bleu(hyps, [refs])
        chrf = sacrebleu.corpus_chrf(hyps, [refs])
    except EOFError:
        sys.stderr.write('Error: source and reference have different'
                         + ' lengths.\n')
        sys.exit(-1)
    return bleu.score, chrf.score
def eval(preds: List[str], refs: List[str]):
    """BLEU score computation.

    Strips all characters belonging to the unicode category "So".
    Tokenizes with the standard WMT "13a" tokenizer.
    Computes 4-gram BLEU.

    Args:
        preds (List[str]): List of translated texts.
        refs (List[str]): List of target reference texts.

    Returns:
        A tuple of (BLEU score, cleaned preds, cleaned refs).
    """
    preds = [OTHERS_PATTERN.sub(" ", text) for text in preds]
    refs = [OTHERS_PATTERN.sub(" ", text) for text in refs]
    return (
        corpus_bleu(preds, [refs], lowercase=True, tokenize="13a",
                    use_effective_order=False).score,
        preds,
        refs,
    )
def eval_bleu_moses(self, ref_file: str, evaluation_dir: str, sys_file: str):
    import sacrebleu
    try:
        os.makedirs(evaluation_dir)
    except FileExistsError:
        logger.warning(evaluation_dir + " already exists")

    subprocess.run([
        f"cat {ref_file} | {MOSES_DETOKENIZER} -l en > {evaluation_dir}/ref.txt"
    ], shell=True)
    subprocess.run([
        f"cat {sys_file} | {MOSES_DETOKENIZER} -l en > {evaluation_dir}/sys.txt"
    ], shell=True)

    with open(f"{evaluation_dir}/ref.txt", 'r+') as file:
        refs = [file.read().split('\n')]
    with open(f"{evaluation_dir}/sys.txt", 'r+') as file:
        sys = file.read().split('\n')

    bleu = sacrebleu.corpus_bleu(sys, refs)
    return bleu.score
def bleu(self, refs, preds):
    """
    Returns `t5` style BLEU scores. See the related implementation:
    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L41

    :param refs:
        A `list` of `list` of reference `str`s.
    :param preds:
        A `list` of predicted `str`s.
    """
    score = sacrebleu.corpus_bleu(
        preds,
        refs,
        smooth_method="exp",
        smooth_value=0.0,
        force=False,
        lowercase=False,
        tokenize="intl",
        use_effective_order=False,
    ).score
    return score
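# For reference, the t5-style call above boils down to the following direct
# sacreBLEU invocation (hypothetical data). `refs` is a list of reference
# streams: one stream per reference set, each aligned with `preds`.
import sacrebleu

preds = ["the cat sat on the mat", "a dog barked"]
refs = [["the cat sat on the mat", "the dog barked"]]  # a single reference stream
score = sacrebleu.corpus_bleu(
    preds, refs,
    smooth_method="exp", smooth_value=0.0,
    force=False, lowercase=False,
    tokenize="intl", use_effective_order=False,
).score
print(score)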
def evaluate(self, result_path):
    from sacrebleu import download_test_set, corpus_bleu, smart_open
    assert os.path.exists(result_path)
    tmp_path = "/tmp/sacrebleu_tmp.txt"
    self.recover_subwords(result_path, tmp_path)
    if self.dataset_token is not None:
        _, *refs = download_test_set(self.dataset_token, self.langpair_token)
        if not refs:
            raise SystemError(
                "Error with dataset_token and langpair_token: {} {}".format(
                    self.dataset_token, self.langpair_token))
        refs = [smart_open(x, encoding="utf-8").readlines() for x in refs]
    else:
        refs = [self.ref_lines]
    hyp_lines = open(result_path).readlines()
    bleu = corpus_bleu(hyp_lines, refs, tokenize=self.tokenizer, lowercase=self.lowercase)
    return float(bleu.score)
def evaluate_bleu(model, iterator):
    model.eval()
    hyp = []
    ref = []
    for batch in tqdm(iterator):
        src, trg = batch.src.T, batch.trg.T
        outputs = search(model, src)
        outputs = outputs[:, 1:]
        hyp += get_text_from_tensor(outputs, TRG)
        ref += get_text_from_tensor(trg, TRG)

    # expand dim of reference list
    # sys = ['translation_1', 'translation_2']
    # ref = [['truth_1', 'truth_2'], ['another truth_1', 'another truth_2']]
    ref = [ref]
    return sacrebleu.corpus_bleu(hyp, ref, force=True).score
def cache_stats(self, ref, out):
    """
    Cache sufficient statistics for calculating the SacreBLEU score

    Args:
        ref: A reference corpus
        out: An output corpus

    Returns:
        A list of cached statistics
    """
    if self.case_insensitive:
        ref = corpus_utils.lower(ref)
        out = corpus_utils.lower(out)

    cached_stats = []
    for r, o in zip(ref, out):
        re = sacrebleu.corpus_bleu(" ".join(o), " ".join(r))
        cached_stats.append((re.counts, re.totals, re.sys_len, re.ref_len))
    return cached_stats
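# A sketch of how cached statistics like the ones returned above can later be
# aggregated into a single corpus-level score (the same pattern appears in
# score_corpus_multiprocess further down): sum the n-gram counts and the
# hypothesis/reference lengths, then recompute BLEU once. Assumes a sacrebleu
# version that still exposes NGRAM_ORDER and compute_bleu.
def aggregate_cached_stats(cached_stats):
    correct = [0] * sacrebleu.NGRAM_ORDER
    total = [0] * sacrebleu.NGRAM_ORDER
    sys_len, ref_len = 0, 0
    for counts, totals, s_len, r_len in cached_stats:
        for n in range(sacrebleu.NGRAM_ORDER):
            correct[n] += counts[n]
            total[n] += totals[n]
        sys_len += s_len
        ref_len += r_len
    return sacrebleu.compute_bleu(correct, total, sys_len, ref_len,
                                  smooth_method='exp').score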
def eval_with_bleu(self, model, dataloader):
    import sacrebleu

    def decode(task, toks, escape_unk=False):
        toks = toks.tolist()
        # bos = task.vocab.encode("<s>")
        # eos = task.vocab.encode("</s>")
        bos = task.vocab.model.bos_id()
        eos = task.vocab.model.eos_id()
        while bos in toks:
            toks.remove(bos)
        while eos in toks:
            toks.remove(eos)
        s = task.vocab.decode(toks)
        return s.strip()

    hyps = []
    refs = []
    preds = torch.Tensor([self.vocab.model.bos_id()])
    for batch in tqdm(dataloader):
        mask_batch = batch
        mask_batch['net_input']['prev_output_tokens'] = prev_outputs
        preds = model(**mask_batch['net_input'])
        # print(preds[0][0].shape)
        # print(decode(task, torch.argmax(preds[0][0], dim=1)))
        # print(decode(task,
        #              utils.strip_pad(batch['target'][0], task.vocab.pad()),
        #              escape_unk=True,  # don't count <unk> as matches to the hypo
        #              ))
        for i in range(preds[0].shape[0]):
            hyps.append(decode(self, torch.argmax(preds[0][i], dim=1)))
            refs.append(
                decode(
                    self,
                    utils.strip_pad(batch['target'][i], self.vocab.pad()),
                    escape_unk=True,  # don't count <unk> as matches to the hypo
                ))
    return sacrebleu.corpus_bleu(hyps, [refs]), hyps
def get_all_scores(
        orig_sents: List[str],
        sys_sents: List[str],
        refs_sents: List[List[str]],
        lowercase: bool = False,
        tokenizer: str = '13a',
        metrics: List[str] = DEFAULT_METRICS,
):
    scores = OrderedDict()
    if 'bleu' in metrics:
        scores['BLEU'] = corpus_bleu(sys_sents, refs_sents, force=True,
                                     tokenize=tokenizer, lowercase=lowercase).score
    if 'sari' in metrics:
        scores['SARI'] = corpus_sari(orig_sents, sys_sents, refs_sents,
                                     tokenizer=tokenizer, lowercase=lowercase)
    if 'samsa' in metrics:
        from easse.samsa import corpus_samsa
        scores['SAMSA'] = corpus_samsa(orig_sents, sys_sents, tokenizer=tokenizer,
                                       verbose=True, lowercase=lowercase)
    if 'fkgl' in metrics:
        scores['FKGL'] = corpus_fkgl(sys_sents, tokenizer=tokenizer)

    quality_estimation_scores = corpus_quality_estimation(orig_sents, sys_sents,
                                                          tokenizer=tokenizer,
                                                          lowercase=lowercase)
    scores = add_dicts(scores, quality_estimation_scores)
    return {key: round(value, 2) for key, value in scores.items()}
def eval_measure(gold, sys, eval_type='bleu'):
    ''' Evaluation measure

    This takes in gold references and system outputs and evaluates their quality.
    It currently supports:
    * BLEU score (bleu), via sacreBLEU with internal tokenization
    * chrF3 (chrf3), character n-gram F-score with beta=3
    * hLEPOR (hlepor)

    :param gold: the correct labels (reference)
    :param sys: the system outputs (hypothesis)
    :param eval_type: The type of evaluation to do (bleu, chrf3, hlepor)
    '''
    if eval_type == EVAL_TYPE_BLEU:
        # make sure score is 0-based instead of 100-based
        return corpus_bleu(sys, [gold]).score / 100.
    elif eval_type == EVAL_TYPE_CHRF3:
        return corpus_chrf(sys, [gold], beta=3).score
    elif eval_type == EVAL_TYPE_HLEPOR:
        return hlepor_score(sys, gold)
    else:
        raise NotImplementedError('Unknown eval type in eval_measure: %s' % eval_type)
def cal_score(triple, q, a):
    """
    Calculate whether the triple (entity, relation, something) appears in the question and answer.
    """
    if len(triple[2]) == 0 or len(triple[3]) == 0:
        return 0  # something empty, like ["异灵灵异-2002", "评论", ""]
    qa = q + ' ' + a
    # left entity appears ('天气' means "weather")
    score = 1 if (triple[0].replace(' ', '') in qa.replace(' ', '')
                  or ('天气' == triple[1] and '天气' in q)) else 0
    score += check_relation(triple[1], qa)  # relation appears
    if triple[1] == '出生地' and score < 2:  # '出生地' means "birthplace"
        score -= 4  # probably should not use birthplace knowledge
    if triple[2] in a:  # something directly appears
        score += 2
    else:
        bleu = sacrebleu.corpus_bleu([a], [[triple[2]]]).score
        if bleu > 10:
            score += 2
        else:
            score -= 2
    return score
def compute(
    self,
    labels: Sequence[Text],
    preds: Sequence[Text],
    label_spec: lit_types.TextSegment,
    pred_spec: lit_types.GeneratedText,
    config: Optional[lit_types.JsonDict] = None,
) -> Dict[Text, float]:
    del label_spec
    del pred_spec
    del config

    if not labels or not preds:
        return {}

    bleu_score = sacrebleu.corpus_bleu(
        preds,
        [labels],
        lowercase=True,
        tokenize=self._data_config.get("sacrebleu_tokenize", "13a"),
    )
    return {"bleu": bleu_score.score}
def score_corpus_multiprocess(self, hypothesis: List[str],
                              references: List[List[str]]) -> float:
    tokenizer = get_optional_dict(self.extra_args, 'bleu_tokenizer', 'none')
    if self.n_workers == 1:
        corpus_score = sb.corpus_bleu(hypothesis, references,
                                      force=True, tokenize=tokenizer).score
    else:
        batches = list(self._batch(hypothesis, references, n_batches=self.n_workers))
        ref_len, sys_len = 0, 0
        correct = [0 for _ in range(sb.NGRAM_ORDER)]
        total = [0 for _ in range(sb.NGRAM_ORDER)]
        with ProcessPoolExecutor(max_workers=self.n_workers) as executor:
            futures = [
                executor.submit(sb.corpus_bleu, b[0], b[1],
                                force=True, tokenize=tokenizer)
                for b in batches
            ]
            progress = as_completed(futures)
            if self.verbose:
                progress = tqdm(progress)
            for future in progress:
                s = future.result()
                ref_len += s.ref_len
                sys_len += s.sys_len
                for n in range(sb.NGRAM_ORDER):
                    correct[n] += s.counts[n]
                    total[n] += s.totals[n]
        corpus_score = sb.compute_bleu(correct, total, sys_len, ref_len,
                                       smooth_method='exp').score
    return corpus_score
def eval_measure(gold, sys, eval_type='acc'):
    ''' Evaluation measure

    This takes in gold labels and system outputs and evaluates their accuracy.
    It currently supports:
    * Accuracy (acc), percentage of labels that match
    * Pearson's correlation coefficient (pearson)
    * BLEU score (bleu)
    * BLEU_detok, on detokenized references and translations, with internal tokenization
    * F1 (f1), macro-averaged F1 (macro_f1), precision (prec), recall (rec)
    * Average of the system outputs (avg)

    :param gold: the correct labels
    :param sys: the system outputs
    :param eval_type: The type of evaluation to do
                      (acc, pearson, bleu, bleu_detok, f1, macro_f1, prec, rec, avg)
    '''
    if eval_type == EVAL_TYPE_ACC:
        return sum([1 if g == s else 0 for g, s in zip(gold, sys)]) / float(len(gold))
    elif eval_type == EVAL_TYPE_BLEU:
        import nltk
        gold_wrap = [[x] for x in gold]
        return nltk.translate.bleu_score.corpus_bleu(gold_wrap, sys)
    elif eval_type == EVAL_TYPE_PEARSON:
        return np.corrcoef([gold, sys])[0, 1]
    elif eval_type == EVAL_TYPE_BLEU_DETOK:
        import sacrebleu
        # make sure score is 0-based instead of 100-based
        return sacrebleu.corpus_bleu(sys, [gold]).score / 100.
    elif eval_type == EVAL_TYPE_F1:
        return f1_score(gold, sys)
    elif eval_type == EVAL_TYPE_MACRO_F1:
        return f1_score(gold, sys, average="macro")
    elif eval_type == EVAL_TYPE_PREC:
        return precision_score(gold, sys)
    elif eval_type == EVAL_TYPE_REC:
        return recall_score(gold, sys)
    elif eval_type == EVAL_TYPE_AVG:
        return np.mean(sys)
    else:
        raise NotImplementedError('Unknown eval type in eval_measure: %s' % eval_type)
def calculate_bleu(model, level, raw_hypo, raw_ref):
    # hypo and ref are word indices
    hypotheses = model.trg_vocab.arrays_to_sentences(arrays=raw_hypo, cut_at_eos=True)
    references = model.trg_vocab.arrays_to_sentences(arrays=raw_ref, cut_at_eos=True)
    # print('hypotheses', hypotheses)
    # print('references', references)

    join_char = " " if level in ["word", "bpe"] else ""
    # valid_sources = [join_char.join(s) for s in data.src]
    valid_references = [join_char.join(t) for t in references]
    valid_hypotheses = [join_char.join(t) for t in hypotheses]

    bleu_score = sacrebleu.corpus_bleu(sys_stream=valid_hypotheses,
                                       ref_streams=[valid_references],
                                       smooth_method='floor',
                                       smooth_value=0.01).score
    # print('bleu', bleu_score)
    bleu_score = torch.Tensor([bleu_score])
    bleu_score_sigmoid = torch.sigmoid(bleu_score)
    # print('sigmoid_bleu', bleu_score_sigmoid)
    return bleu_score_sigmoid
def computeBLEU(outputs, targets):
    # transpose targets from one-list-of-references-per-segment into
    # one-reference-stream-per-position, the layout corpus_bleu expects
    targets = [[t[i] for t in targets] for i in range(len(targets[0]))]
    return corpus_bleu(outputs, targets, lowercase=True).score
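# Hypothetical usage of computeBLEU above: `targets` holds the references for each
# output segment (two references per segment here), and the transposition inside
# computeBLEU converts them into reference streams before scoring.
outputs = ["the cat sat on the mat", "a dog barked"]
targets = [["The cat sat on the mat.", "A cat was sitting on the mat."],
           ["A dog barked.", "The dog was barking."]]
print(computeBLEU(outputs, targets))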