Example #1
def eval_measure(gold, sys, eval_type='acc'):
  ''' Evaluation measure
  
  This takes in gold labels and system outputs and evaluates their
  accuracy. It currently supports:
  * Accuracy (acc), percentage of labels that match
  * Pearson's correlation coefficient (pearson)
  * BLEU score (bleu)
  * BLEU_detok, on detokenized references and translations, with internal tokenization

  :param gold: the correct labels
  :param sys: the system outputs
  :param eval_type: The type of evaluation to do (acc, pearson, bleu, bleu_detok)
  '''
  if eval_type == EVAL_TYPE_ACC:
    return sum([1 if g == s else 0 for g, s in zip(gold, sys)]) / float(len(gold))
  elif eval_type == EVAL_TYPE_BLEU:
    import nltk
    gold_wrap = [[x] for x in gold]
    return nltk.translate.bleu_score.corpus_bleu(gold_wrap, sys)
  elif eval_type == EVAL_TYPE_PEARSON:
    return np.corrcoef([gold, sys])[0,1]
  elif eval_type == EVAL_TYPE_BLEU_DETOK:
    import sacrebleu
    # make sure score is 0-based instead of 100-based
    return sacrebleu.corpus_bleu(sys, [gold]).score / 100.
  else:
    raise NotImplementedError('Unknown eval type in eval_measure: %s' % eval_type)
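For orientation, a minimal invocation sketch for the function above, on hypothetical data; the EVAL_TYPE_* constants and the numpy import are assumed to exist at module level in the original source (values inferred from the docstring).

import numpy as np  # assumed module-level import used by the pearson branch

# assumed constant values, inferred from the docstring
EVAL_TYPE_ACC = 'acc'
EVAL_TYPE_PEARSON = 'pearson'
EVAL_TYPE_BLEU = 'bleu'
EVAL_TYPE_BLEU_DETOK = 'bleu_detok'

gold = ['the cat sat on the mat .', 'hello world !']
hyps = ['the cat sat on a mat .', 'hello world !']
print(eval_measure(gold, hyps, eval_type=EVAL_TYPE_BLEU_DETOK))  # sacreBLEU scaled to 0-1
print(eval_measure([1, 0, 1], [1, 1, 1], eval_type=EVAL_TYPE_ACC))  # 2/3 of labels match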
Example #2
    def score_batch(self,
                    hypotheses: List[List[str]],
                    references: List[List[str]]) -> float:

        hyp_joined = [" ".join(hyp) for hyp in hypotheses]
        ref_joined = [" ".join(ref) for ref in references]

        bleu = corpus_bleu(hyp_joined, [ref_joined],
                           smooth_method=self.smooth_method,
                           smooth_value=self.smooth_value,
                           force=self.force,
                           lowercase=self.lowercase,
                           tokenize=self.tokenize,
                           use_effective_order=self.use_effective_order)

        return bleu.score
Example #3
def bleu(targets, predictions):
    predictions = [tf.compat.as_text(x) for x in predictions]

    if isinstance(targets[0], list):
        targets = [[tf.compat.as_text(x) for x in target]
                   for target in targets]
    else:
        targets = [tf.compat.as_text(x) for x in targets]
        targets = [targets]

    bleu_score = corpus_bleu(predictions,
                             targets,
                             smooth_method="exp",
                             smooth_value=0.0,
                             force=False,
                             lowercase=False,
                             tokenize="ja",
                             use_effective_order=False)
    return {"bleu": bleu_score.score}
Example #4
def get_score_by_task(task):
    results = {}
    for lg in task2lg[task].split():
        preds = []
        labels = []
        guid = 0
        guids = []
        if task == "MLQA":
            datasets, preds = load_mlqa_data(task, lg)
        elif task in ["NC", "QADSM", "QAM", "WPR"]:
            preds, labels, guids = load_data(task, lg)
        elif task in ["QG", "NTG"]:
            preds, labels = load_qg_ntg_data(task, lg)
        elif task in ["NER", "POS"]:
            preds, labels = load_ner_pos_data(task, lg)
        elif task == "XNLI":
            labels, preds = load_xnli_data(task, lg)
        elif task == "PAWSX":
            labels, preds = load_pawsx_data(task, lg)

        if task == "MLQA":
            results[lg] = mlqa_evaluate(datasets["data"], preds,
                                        lg)["f1"] / 100  #Normalize
        elif task == "NER":
            results[lg] = f1_score(labels, preds)
        elif task == "POS":
            results[lg] = precision_score(labels, preds)
        elif task in ["NC", "XNLI", "PAWSX", "QADSM", "QAM"]:
            results[lg] = simple_accuracy(preds, labels)
        elif task == "WPR":
            results[lg] = simple_ndcg(preds, labels, guids)
        elif task == "QG" or task == "NTG":
            results[lg] = sacrebleu.corpus_bleu(
                preds, [labels], lowercase=True).score / 100  #Normalize
    # macro-average over languages
    results["avg"] = sum(results.values()) / len(results)
    return results
Example #5
def fetch_translations_and_bleu(model: nn.Module,
                                dataset: DataLoader,
                                tokenizer: Tokenizer,
                                iterations=10,
                                k=1,
                                num_samples=6000):
    """
        BLEU keeping number of samples in training and validation same
    """
    model.eval()

    device = torch.device("cpu")
    if torch.cuda.is_available():
        device = torch.device("cuda")
        model.cuda()

    pred = []
    tgt = []
    src = []

    for batch in tqdm(dataset, desc="predicting ... ", leave=False):

        # iterate over the batch keys without shadowing the beam-size argument `k`
        for key in batch:
            batch[key] = batch[key].to(device)

        out = model.generate(**batch, iterations=iterations, tokenizer=tokenizer, k=k)
        pred.extend(out["tgt_text"])
        src.extend(tokenizer.batch_decode(batch["input_ids"], is_src_txt=True))
        tgt.extend(tokenizer.batch_decode(batch["labels"], is_tgt_txt=True))

        if len(pred) > num_samples:
            break

    # bleu score
    bleu = corpus_bleu(pred, [tgt]).score

    return {
        "bleu": bleu,
        "src": src,
        "tgt": tgt,
        "pred": pred
    }
Example #6
def cal_bleu(samples, task, args):

    tokenizer = encoders.build_tokenizer(args)
    bpe = encoders.build_bpe(args)

    def decode_fn(x):
        return (x + ' ').replace('@@ ', '').rstrip()

    tgt_dict = task.target_dictionary

    target_tensor = samples['target']
    cand_tensor = samples['cand']
    batch_bleu = []
    assert len(target_tensor) == len(cand_tensor)

    for i in range(len(target_tensor)):
        tgt_tokens = utils.strip_pad(target_tensor[i],
                                     tgt_dict.pad()).int().cpu()
        cand_tokens = utils.strip_pad(cand_tensor[i],
                                      tgt_dict.pad()).int().cpu()

        tgt_str = tgt_dict.string(tgt_tokens,
                                  None,
                                  escape_unk=True,
                                  extra_symbols_to_ignore={tgt_dict.eos()})
        tgt_str = decode_fn(tgt_str)

        cand_str = tgt_dict.string(cand_tokens,
                                   None,
                                   escape_unk=True,
                                   extra_symbols_to_ignore={tgt_dict.eos()})
        cand_str = decode_fn(cand_str)

        bleuscore = sacrebleu.corpus_bleu([cand_str], [[tgt_str]],
                                          use_effective_order=True)
        batch_bleu.append(bleuscore.score)

    samples['bleu'] = batch_bleu

    assert len(batch_bleu) == len(target_tensor)

    return samples
Example #7
    def _compute_score(self):
        """Computes sacreBLEU score for current submission."""

        sgml_path = str(self.sgml_file.name)
        text_path = sgml_path.replace('.sgm', '.txt')
        ref_path = Path(self.test_set.reference_file.name)

        from sacrebleu import process_to_text, corpus_bleu

        # Extract raw text from SGML file
        if not Path(text_path).exists():
            process_to_text(sgml_path, text_path)

        hyp_stream = [x for x in open(text_path, encoding='utf-8')]
        ref_stream = [r for r in open(ref_path, encoding='utf-8')]

        bleu = corpus_bleu(hyp_stream, [ref_stream])

        self.score = bleu.score
        self.save()
Example #8
    def bleu_score(self):
        hypotheses = []
        for prompt in self.pred:
            d = self.pred[prompt]
            pred_lst = sorted(d.items(),
                              key=lambda item: item[1],
                              reverse=True)
            hypotheses.append(pred_lst[0][0])
        references = []
        for prompt in self.gold:
            d = self.gold[prompt]
            ref_lst = sorted(d.items(), key=lambda item: item[1], reverse=True)
            ref_lst = [item[0] for item in ref_lst]
            references.append(ref_lst)
        references_t = [
            list(ref_t) for ref_t in zip_longest(*references, fillvalue='')
        ]

        bleu = sacrebleu.corpus_bleu(hypotheses, references_t)
        return bleu.score
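The zip_longest transposition above converts per-prompt reference lists into sacrebleu's per-stream layout, padding missing references with empty strings; a small standalone sketch on hypothetical data:

from itertools import zip_longest

import sacrebleu

hypotheses = ['the dog bit the man .', 'hello world !']
references = [['the dog bit the man .', 'the dog had bit the man .'],
              ['hello world !']]  # the second example has only one reference
references_t = [list(ref_t) for ref_t in zip_longest(*references, fillvalue='')]
# references_t == [['the dog bit the man .', 'hello world !'],
#                  ['the dog had bit the man .', '']]
print(sacrebleu.corpus_bleu(hypotheses, references_t).score)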
Example #9
 def sacre_bleu(self, hypo, groundtruth=None, lc=False):
     if groundtruth is None:
         ref = self._refs_for_sacre
     else:
         if isinstance(groundtruth[0], str):
             ref = [groundtruth]
         else:
             ref = groundtruth
     try:
         bleu = sacrebleu.corpus_bleu(
             hypo, ref, lowercase=lc, tokenize=self._sacre_tokenize_str)
         return bleu.score
     except IndexError:
         logging.info("Found empty lines.")
         print(traceback.format_exc())
         return 0.
     except ZeroDivisionError:
         logging.info("Empty reference")
         print(traceback.format_exc())
         return 0.
Example #10
def bleu(corpus, truths):
    '''
    corpus: list, NBs * BATCHSIZE * MAX_LEN
    truths: list, NBs * BATCHSIZE * MAX_LEN
    
    return: list of length NBs, average BLEU score for each batch
    '''
    n = len(corpus)
    bleus = [0]*n
    for i in range(n):
        pred, true = corpus[i], truths[i]
        sumbleu = 0.0
        
        for j in range(len(corpus[i])):
            pred_tensor, true_tensor = pred[j], true[j]
            pred_sent, true_sent = convert_idx_2_sent(pred_tensor, true_tensor, target_tra)
            # sacrebleu expects the hypothesis first and a list of reference lists second
            sumbleu += corpus_bleu([pred_sent], [[true_sent]]).score
        avgbleu = sumbleu / len(corpus[i])
        bleus[i] = avgbleu
    return bleus
Example #11
    def _compute_score(self):
        """Computes sacreBLEU score for current submission."""

        sgml_path = str(self.sgml_file.name)
        text_path = sgml_path.replace('.sgm', '.txt')
        ref_path = 'testsets/wmt18.ende.ref.txt'

        from sacrebleu import process_to_text, corpus_bleu
        from pathlib import Path

        if not Path(text_path).exists():
            process_to_text(sgml_path, text_path)

        hyp_stream = [x for x in open(text_path, encoding='utf-8')]
        ref_stream = [r for r in open(ref_path, encoding='utf-8')]

        bleu = corpus_bleu(hyp_stream, [ref_stream])

        self.score = bleu.score
        self.save()
Example #12
def eval_moses_bleu(ref, hyp):
    """
    Given a file of hypothesis and reference files,
    evaluate the BLEU score using Moses scripts.
    """
    assert os.path.isfile(hyp)
    assert os.path.isfile(ref) or os.path.isfile(ref + '0')

    hyps, refs = [], []

    with open(hyp) as fh, open(ref) as rh:
        for line in fh:
            hyps.append(line.strip())

        for line in rh:
            refs.append(line.strip())

        score = sacrebleu.corpus_bleu(hyps, [refs], tokenize='none').score

    return score
Example #13
def bs_test(encoder, decoder, data_loader, beam_k, max_length, train_input_lang, train_output_lang):
    count = 0
    
    candidate_corpus = []
    reference_corpus = []

    for i, (input, input_len, target, target_len) in enumerate(data_loader):
        decoded_words = bs_evaluate(encoder, decoder, input, beam_k, max_length)
        candidate_sentences = []
        for ind in range(decoded_words.shape[0]):
            sent_words = []
            for token in decoded_words[ind]:
                if token != PAD_token and token != EOS_token:
                    sent_words.append(train_output_lang.index2word[token])
                else:
                    break
            sent_words = ' '.join(sent_words)
            # print the first sentence in the first batch to peek the translation result
            if count == 0:
                print('predict: '+sent_words)
                count += 1
            candidate_sentences.append(sent_words)
        candidate_corpus.extend(candidate_sentences)

        reference_sentences = []
        for sent in target:
            sent_words = []
            for token in sent:
                if token.item() != EOS_token:
                    sent_words.append(train_output_lang.index2word[token.item()])
                else:
                    break
            sent_words = ' '.join(sent_words)
            if count == 1:
                print('target: '+sent_words)
                count += 1
            reference_sentences.append(sent_words)
        reference_corpus.extend(reference_sentences)
    
    score = corpus_bleu(candidate_corpus, [reference_corpus], smooth='exp', smooth_floor=0.0, force=False).score
    return score
Example #14
def evaluatebleu(encoder1, decoder1, loader, with_o=True):

    score = 0
    output_words = []
    true_words = []
    for i, (candidate, length_1, reference, length_2) in enumerate(loader):
        #print(i)
        if with_o == True:
            max_length = max(length_2).item()
            output_words += evaluate(encoder1, decoder1, candidate, length_1,
                                     reference, length_2, max_length)
        else:
            # note: `pair` is not defined in this scope; this branch only runs when with_o=False
            output_words, attentions = evaluate(encoder1, decoder1, pair[0])
        true_words += mapback(reference)

    score = sacrebleu.corpus_bleu(output_words, [true_words])

    print(output_words[0])
    print(true_words[0])

    return (score, output_words, true_words)
Example #15
def write_evals(writer, experiment, translation, file_path, ref, src):
    writer = SummaryWriter("runs/{}-{}".format(experiment, translation))

    steps = int(translation)
    output_path = "translations/{}/{}".format(experiment, translation)
    with open(output_path, "r", encoding="utf-8") as infile:
        system_output = [x.strip() for x in infile.readlines()]
        bleu = sacrebleu.corpus_bleu(system_output, [ref])
        chrf = sacrebleu.corpus_chrf(system_output, [ref])
        rhyme_score, copied, reconstructed = concurrent_score(system_output,
                                                                languages[experiment],
                                                                ref, src)
        print(experiment, translation, bleu.score, rhyme_score, copied, reconstructed)

    wall = os.stat(file_path).st_mtime
    writer.add_scalar(experiment + "/CHRF", chrf.score, global_step=steps, walltime=wall)
    writer.add_scalar(experiment + "/BLEU", bleu.score, global_step=steps, walltime=wall)
    writer.add_scalar(experiment + "/Rhyme", rhyme_score, global_step=steps, walltime=wall)
    writer.add_scalar(experiment + "/Copied", copied, global_step=steps, walltime=wall)
    writer.add_scalar(experiment + "/Reconstructed", reconstructed, global_step=steps, walltime=wall)
    writer.flush()
Example #16
    def evaluate(self, predicts, answers):
        """
        import sacrebleu

        refs = [['The dog bit the man.', 'It was not unexpected.', 'The man bit him first.'],
                ['The dog had bit the man.', 'No one was surprised.', 'The man had bitten the dog.']]
        sys = ['The dog bit the man.', "It wasn't surprising.", 'The man had just bitten him.']

        bleu = sacrebleu.corpus_bleu(sys, refs)

        bleu.score
        48.530827009929865
        """

        try:
            bleu = sacrebleu.corpus_bleu(predicts, answers, lowercase=True)
        except EOFError:
            print('# preds', len(predicts))
            print('# tgts', len(answers))
            exit()
        return {'BLEU': bleu.score}
Example #17
    def validation_epoch_end(self, outputs) -> None:
        unpad = self.trainer.datamodule.unpad
        bpe = yttm.BPE(self.bpe_file)

        loss = torch.stack([o[0] for o in outputs]).mean()
        acc = self.val_acc.compute().item()
        self.val_acc.reset()

        x = bpe.decode([l for o in outputs for l in unpad(o[1])])
        hyp = bpe.decode([l for o in outputs for l in unpad(o[2])])
        y_true = bpe.decode([l for o in outputs for l in unpad(o[3])])
        bleu = sacrebleu.corpus_bleu(hyp, [y_true]).score
        self.log('val/loss', loss, True)
        self.log('val/acc', acc, True)
        self.log('val/bleu', bleu, True)
        nni.report_intermediate_result({'bleu': bleu})

        # save files
        self.save_file(f'val.{self.src_lang}', x)
        self.save_file(f'val.{self.trg_lang}', y_true)
        self.save_file(f'val.{self.trg_lang}.{self.global_step}.hyp', hyp)
Example #18
def compute_metrics(ref, hyp, hyp_order):
    refs = []
    hyps = []
    for id in hyp_order:
        for segment in hyp[id]:
            hyps.append(segment)
        try:
            for segment in ref[id]:
                refs.append(segment)
        except KeyError:
            sys.stderr.write('Error: there are no references for document' +
                             ' "' + id + '"\n')
            sys.exit(-1)
    try:
        bleu = sacrebleu.corpus_bleu(hyps, [refs])
        chrf = sacrebleu.corpus_chrf(hyps, [refs])
    except EOFError:
        sys.stderr.write('Error: source and reference have different' +
                         ' lengths.\n')
        sys.exit(-1)
    return bleu.score, chrf.score
Example #19
def eval(preds: List[str], refs: List[str]) -> float:
    """BLEU score computation.

    Strips all characters belonging to the unicode category "So".
    Tokenize with standard WMT "13a" tokenizer.
    Compute 4-BLEU.

    Args:
        preds (List[str]): List of translated texts.
        refs (List[str]): List of target reference texts.
    """
    preds = [OTHERS_PATTERN.sub(" ", text) for text in preds]
    refs = [OTHERS_PATTERN.sub(" ", text) for text in refs]
    return (
        corpus_bleu(preds, [refs],
                    lowercase=True,
                    tokenize="13a",
                    use_effective_order=False).score,
        preds,
        refs,
    )
Example #20
 def eval_bleu_moses(self, ref_file: str, evaluation_dir: str,
                     sys_file: str):
     import sacrebleu
     try:
         os.makedirs(evaluation_dir)
     except FileExistsError:
         logger.warning(evaluation_dir + " already exists")
     subprocess.run([
         f"cat {ref_file} | {MOSES_DETOKENIZER} -l en > {evaluation_dir}/ref.txt"
     ],
                    shell=True)
     subprocess.run([
         f"cat {sys_file} | {MOSES_DETOKENIZER} -l en > {evaluation_dir}/sys.txt"
     ],
                    shell=True)
     with open(f"{evaluation_dir}/ref.txt", 'r+') as file:
         refs = [file.read().split('\n')]
     with open(f"{evaluation_dir}/sys.txt", 'r+') as file:
         sys = file.read().split('\n')
     bleu = sacrebleu.corpus_bleu(sys, refs)
     return bleu.score
Example #21
    def bleu(self, refs, preds):
        """
        Returns `t5` style BLEU scores. See the related implementation:
        https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L41

        :param refs:
            A `list` of `list` of reference `str`s.
        :param preds:
            A `list` of predicted `str`s.
        """
        score = sacrebleu.corpus_bleu(
            preds,
            refs,
            smooth_method="exp",
            smooth_value=0.0,
            force=False,
            lowercase=False,
            tokenize="intl",
            use_effective_order=False,
        ).score
        return score
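For orientation, the refs argument above is already in sacrebleu's transposed layout (one inner list per reference stream, each aligned with preds); a hypothetical direct sacrebleu call showing that layout with the 'intl' tokenizer used above:

import sacrebleu

preds = ['the dog bit the man .', 'it was not surprising .']
refs = [['the dog bit the man .', 'it was not unexpected .']]  # one reference stream, aligned with preds
print(sacrebleu.corpus_bleu(preds, refs, tokenize='intl').score)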
Example #22
 def evaluate(self, result_path):
     from sacrebleu import download_test_set, corpus_bleu, smart_open
     assert os.path.exists(result_path)
     tmp_path = "/tmp/sacrebleu_tmp.txt"
     self.recover_subwords(result_path, tmp_path)
     if self.dataset_token is not None:
         _, *refs = download_test_set(self.dataset_token,
                                      self.langpair_token)
         if not refs:
             raise SystemError(
                 "Error with dataset_token and langpair_token: {} {}".
                 format(self.dataset_token, self.langpair_token))
         refs = [smart_open(x, encoding="utf-8").readlines() for x in refs]
     else:
         refs = [self.ref_lines]
     hyp_lines = open(result_path).readlines()
     bleu = corpus_bleu(hyp_lines,
                        refs,
                        tokenize=self.tokenizer,
                        lowercase=self.lowercase)
     return float(bleu.score)
Example #23
def evaluate_bleu(model, iterator):

    model.eval()

    hyp = []
    ref = []

    for batch in tqdm(iterator):
        src, trg = batch.src.T, batch.trg.T
        outputs = search(model, src)

        outputs = outputs[:, 1:]

        hyp += get_text_from_tensor(outputs, TRG)
        ref += get_text_from_tensor(trg, TRG)

    # expand dim of reference list
    # sys = ['translation_1', 'translation_2']
    # ref = [['truth_1', 'truth_2'], ['another truth_1', 'another truth_2']]
    ref = [ref]
    return sacrebleu.corpus_bleu(hyp, ref, force=True).score
Example #24
  def cache_stats(self, ref, out):
    """
    Cache sufficient statistics for caculating SacreBLEU score

    Args:
      ref: A reference corpus
      out: An output corpus

    Returns:
      A list of cached statistics
    """
    if self.case_insensitive:
      ref = corpus_utils.lower(ref)
      out = corpus_utils.lower(out)

    cached_stats = []
    for r, o in zip(ref, out):
      re = sacrebleu.corpus_bleu(" ".join(o), " ".join(r))
      cached_stats.append( (re.counts, re.totals, re.sys_len, re.ref_len) )

    return cached_stats
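The cached tuples above can later be pooled back into a corpus-level score; a sketch (the pool_cached_stats helper is hypothetical) assuming the pre-2.0 sacrebleu API (NGRAM_ORDER and compute_bleu), the same aggregation that Example #30 below performs:

import sacrebleu  # pre-2.0 API: exposes NGRAM_ORDER and compute_bleu


def pool_cached_stats(cached_stats):
    # Sum the per-sentence n-gram match counts, totals, and lengths,
    # then recompute corpus-level BLEU from the pooled statistics.
    correct = [0] * sacrebleu.NGRAM_ORDER
    total = [0] * sacrebleu.NGRAM_ORDER
    sys_len, ref_len = 0, 0
    for counts, totals, s_len, r_len in cached_stats:
        for n in range(sacrebleu.NGRAM_ORDER):
            correct[n] += counts[n]
            total[n] += totals[n]
        sys_len += s_len
        ref_len += r_len
    return sacrebleu.compute_bleu(correct, total, sys_len, ref_len,
                                  smooth_method='exp').score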
Example #25
    def eval_with_bleu(self, model, dataloader):
        import sacrebleu

        def decode(task, toks, escape_unk=False):
            toks = toks.tolist()
            #bos = task.vocab.encode("<s>")
            #eos = task.vocab.encode("</s>")
            bos = task.vocab.model.bos_id()
            eos = task.vocab.model.eos_id()
            while bos in toks:
                toks.remove(bos)
            while eos in toks:
                toks.remove(eos)
            s = task.vocab.decode(toks)
            return s.strip()

        hyps = []
        refs = []
        preds = torch.Tensor([self.vocab.model.bos_id()])
        for batch in tqdm(dataloader):
            mask_batch = batch
            mask_batch['net_input']['prev_output_tokens'] = prev_outputs
            preds = model(**mask_batch['net_input'])
            # print(preds[0][0].shape)
            # print(decode(task,torch.argmax(preds[0][0], dim=1)))
            # print(decode(task,
            #             utils.strip_pad(batch['target'][0], task.vocab.pad()),
            #             escape_unk=True,  # don't count <unk> as matches to the hypo
            #         ))
            for i in range(preds[0].shape[0]):
                hyps.append(decode(self, torch.argmax(preds[0][i], dim=1)))
                refs.append(
                    decode(
                        self,
                        utils.strip_pad(batch['target'][i], self.vocab.pad()),
                        escape_unk=
                        True,  # don't count <unk> as matches to the hypo
                    ))

        return sacrebleu.corpus_bleu(hyps, [refs]), hyps
Example #26
def get_all_scores(
    orig_sents: List[str],
    sys_sents: List[str],
    refs_sents: List[List[str]],
    lowercase: bool = False,
    tokenizer: str = '13a',
    metrics: List[str] = DEFAULT_METRICS,
):
    scores = OrderedDict()
    if 'bleu' in metrics:
        scores['BLEU'] = corpus_bleu(sys_sents,
                                     refs_sents,
                                     force=True,
                                     tokenize=tokenizer,
                                     lowercase=lowercase).score
    if 'sari' in metrics:
        scores['SARI'] = corpus_sari(orig_sents,
                                     sys_sents,
                                     refs_sents,
                                     tokenizer=tokenizer,
                                     lowercase=lowercase)
    if 'samsa' in metrics:
        from easse.samsa import corpus_samsa

        scores['SAMSA'] = corpus_samsa(orig_sents,
                                       sys_sents,
                                       tokenizer=tokenizer,
                                       verbose=True,
                                       lowercase=lowercase)
    if 'fkgl' in metrics:
        scores['FKGL'] = corpus_fkgl(sys_sents, tokenizer=tokenizer)
    quality_estimation_scores = corpus_quality_estimation(orig_sents,
                                                          sys_sents,
                                                          tokenizer=tokenizer,
                                                          lowercase=lowercase)
    scores = add_dicts(
        scores,
        quality_estimation_scores,
    )
    return {key: round(value, 2) for key, value in scores.items()}
Example #27
def eval_measure(gold, sys, eval_type='bleu'):
    ''' Evaluation measure

    This takes in gold references and system outputs and evaluates their
    quality. It currently supports:
    * BLEU score (bleu)
    * chrF3 (chrf3)
    * hLEPOR (hlepor)
    :param gold: the correct labels (reference)
    :param sys: the system outputs (hypothesis)
    :param eval_type: The type of evaluation to do (bleu, chrf3, hlepor)
    '''
    if eval_type == EVAL_TYPE_BLEU:
        # make sure score is 0-based instead of 100-based
        return corpus_bleu(sys, [gold]).score / 100.
    elif eval_type == EVAL_TYPE_CHRF3:
        return corpus_chrf(sys, [gold], beta=3).score
    elif eval_type == EVAL_TYPE_HLEPOR:
        return hlepor_score(sys, gold)
    else:
        raise NotImplementedError('Unknown eval type in eval_measure: %s' % eval_type)
Example #28
def cal_score(triple, q, a):
    """
    Calculate whether the triple (entity, relation, something) appears in the question and answer.
    """

    if len(triple[2]) == 0 or len(triple[3]) == 0:
        return 0  # something empty like ["异灵灵异-2002", "评论", ""]
    qa = q + ' ' + a
    score = 1 if (triple[0].replace(' ', '') in qa.replace(' ', '') or ('天气' == triple[1] and '天气' in q)) else 0  # left entity appears
    score += check_relation(triple[1], qa)  # relation appears
    if triple[1] == '出生地' and score < 2:
        score -= 4  # probably does not use birthplace knowledge
    if triple[2] in a:  # something directly appears
        score += 2
    else:
        bleu = sacrebleu.corpus_bleu([a], [[triple[2]]]).score
        if bleu > 10:
            score += 2
        else:
            score -= 2

    return score
Example #29
    def compute(
        self,
        labels: Sequence[Text],
        preds: Sequence[Text],
        label_spec: lit_types.TextSegment,
        pred_spec: lit_types.GeneratedText,
        config: Optional[lit_types.JsonDict] = None,
    ) -> Dict[Text, float]:
        del label_spec
        del pred_spec
        del config

        if not labels or not preds:
            return {}

        bleu_score = sacrebleu.corpus_bleu(
            preds,
            [labels],
            lowercase=True,
            tokenize=self._data_config.get("sacrebleu_tokenize", "13a"),
        )
        return {"bleu": bleu_score.score}
Example #30
 def score_corpus_multiprocess(self, hypothesis: List[str],
                               references: List[List[str]]) -> float:
     tokenizer = get_optional_dict(self.extra_args, 'bleu_tokenizer',
                                   'none')
     if self.n_workers == 1:
         corpus_score = sb.corpus_bleu(hypothesis,
                                       references,
                                       force=True,
                                       tokenize=tokenizer).score
     else:
         batches = list(
             self._batch(hypothesis, references, n_batches=self.n_workers))
         ref_len, sys_len = 0, 0
         correct = [0 for _ in range(sb.NGRAM_ORDER)]
         total = [0 for _ in range(sb.NGRAM_ORDER)]
         with ProcessPoolExecutor(max_workers=self.n_workers) as executor:
             futures = [
                 executor.submit(sb.corpus_bleu,
                                 b[0],
                                 b[1],
                                 force=True,
                                 tokenize=tokenizer) for b in batches
             ]
             progress = as_completed(futures)
             if self.verbose:
                 progress = tqdm(progress)
             for future in progress:
                 s = future.result()
                 ref_len += s.ref_len
                 sys_len += s.sys_len
                 for n in range(sb.NGRAM_ORDER):
                     correct[n] += s.counts[n]
                     total[n] += s.totals[n]
             corpus_score = sb.compute_bleu(correct,
                                            total,
                                            sys_len,
                                            ref_len,
                                            smooth_method='exp').score
     return corpus_score
Example #31
def eval_measure(gold, sys, eval_type='acc'):
    ''' Evaluation measure

    This takes in gold labels and system outputs and evaluates their
    accuracy. It currently supports:
    * Accuracy (acc), percentage of labels that match
    * Pearson's correlation coefficient (pearson)
    * BLEU score (bleu)
    * BLEU_detok, on detokenized references and translations, with internal tokenization
    * F1 (f1), macro F1 (macro_f1), precision (prec), and recall (rec)
    * Average of the system outputs (avg)
    :param gold: the correct labels
    :param sys: the system outputs
    :param eval_type: The type of evaluation to do (acc, pearson, bleu, bleu_detok,
        f1, macro_f1, prec, rec, avg)
    '''
    if eval_type == EVAL_TYPE_ACC:
        return sum([1 if g == s else 0
                    for g, s in zip(gold, sys)]) / float(len(gold))
    elif eval_type == EVAL_TYPE_BLEU:
        import nltk
        gold_wrap = [[x] for x in gold]
        return nltk.translate.bleu_score.corpus_bleu(gold_wrap, sys)
    elif eval_type == EVAL_TYPE_PEARSON:
        return np.corrcoef([gold, sys])[0, 1]
    elif eval_type == EVAL_TYPE_BLEU_DETOK:
        import sacrebleu
        # make sure score is 0-based instead of 100-based
        return sacrebleu.corpus_bleu(sys, [gold]).score / 100.
    elif eval_type == EVAL_TYPE_F1:
        return f1_score(gold, sys)
    elif eval_type == EVAL_TYPE_MACRO_F1:
        return f1_score(gold, sys, average="macro")
    elif eval_type == EVAL_TYPE_PREC:
        return precision_score(gold, sys)
    elif eval_type == EVAL_TYPE_REC:
        return recall_score(gold, sys)
    elif eval_type == EVAL_TYPE_AVG:
        return np.mean(sys)
    else:
        raise NotImplementedError('Unknown eval type in eval_measure: %s' %
                                  eval_type)
Example #32
def calculate_bleu(model, level, raw_hypo, raw_ref):
    # hypo and ref are word_index
    hypotheses = model.trg_vocab.arrays_to_sentences(arrays=raw_hypo,
                                                          cut_at_eos=True)
    references = model.trg_vocab.arrays_to_sentences(arrays=raw_ref,
                                                          cut_at_eos=True)
    #print('hypothese', hypotheses)
    #print('reference', references)

    join_char = " " if level in ["word", "bpe"] else ""
    # valid_sources = [join_char.join(s) for s in data.src]
    valid_references = [join_char.join(t) for t in references]
    valid_hypotheses = [join_char.join(t) for t in hypotheses]
    bleu_score = sacrebleu.corpus_bleu(sys_stream=valid_hypotheses,
                                       ref_streams=[valid_references],
                                       smooth_method='floor',
                                       smooth_value=0.01).score
    #print('bleu', bleu_score)
    bleu_score = torch.Tensor([bleu_score])
    bleu_score_sigmoid = torch.sigmoid(bleu_score)
    #print('sigmoid_bleu', bleu_score_sigmoid)
    return bleu_score_sigmoid
Example #33
def computeBLEU(outputs, targets):
    targets = [[t[i] for t in targets] for i in range(len(targets[0]))]
    return corpus_bleu(outputs, targets, lowercase=True).score
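A quick check of the transposition above on hypothetical data; it assumes corpus_bleu has been imported from sacrebleu, as in the surrounding examples:

outputs = ['the dog bit the man .']
targets = [['the dog bit the man .', 'the dog had bit the man .']]  # one example, two references
# inside computeBLEU, targets is regrouped into
# [['the dog bit the man .'], ['the dog had bit the man .']]
print(computeBLEU(outputs, targets))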