Example 1
def get_scores(enc_sources,
               enc_target_sents,
               model,
               device,
               tokenizersrc,
               tokenizertrg,
               search="greedy",
               n=4):
    """
    takes a list of sentences and their translations in string form and returns score objects
    model is the trained transformer model
    tokenizer is the spm sentencpiece vocabulary in the form "name".model
    search is the decoding strategy, either greedy or beam search
    n is the beam width in beam search
    """
    model.eval()
    sp.load(tokenizertrg)
    targets = []
    outputs = []
    target_str = [sp.DecodeIds(sent.tolist()) for sent in enc_target_sents]
    output_str = []
    if search == "greedy":
        x = divide_chunks(enc_sources, 100)
        output_str = []
        for sents in x:
            print((len(output_str) / len(enc_sources)) * 100, end="\r")
            y = translate_enc_sentences(model,
                                        sents,
                                        device,
                                        tokenizertrg,
                                        max_length=150)
            output_str.extend(y)
        bleu = sacrebleu.corpus_bleu(output_str, [target_str])
        chrf = sacrebleu.corpus_chrf(output_str, [target_str])
        ter = sacrebleu.corpus_ter(output_str, [target_str])

        return bleu, chrf, ter
    elif search == "beam":
        prediction = beam_search(source, device, tokenizersrc, tokenizertrg, n)
    sp.Load(tokenizertrg)
    target = sp.DecodeIds(target.tolist())
    targets.append([target.split()])
    target_str.append(target)
    outputs.append(prediction.split())
    output_str.append(prediction)

    bleu = sacrebleu.corpus_bleu(output_str, [target_str])
    chrf = sacrebleu.corpus_chrf(output_str, [target_str])
    ter = sacrebleu.corpus_ter(output_str, [target_str])
    return bleu, chrf, ter
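All of the snippets on this page wrap the references in an extra list (e.g. `[target_str]`) because sacreBLEU's corpus-level functions take one list of hypotheses plus a list of reference streams, one stream per reference set. A minimal, self-contained sketch with made-up sentences (not taken from the example above):

import sacrebleu

hyps = ["the cat sat on the mat", "he reads a book"]
refs = [["the cat sat on the mat", "she reads a book"]]  # one reference stream

bleu = sacrebleu.corpus_bleu(hyps, refs)
chrf = sacrebleu.corpus_chrf(hyps, refs)
ter = sacrebleu.corpus_ter(hyps, refs)

# every result object exposes a .score float (the exact scale of chrF/TER
# depends on the sacrebleu version)
print(bleu.score, chrf.score, ter.score)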
Example 2
def eval_metric(args, hypos, ref):
    if args.metric == "bleu":
        score = sacrebleu.corpus_bleu(hypos, [ref]).score
    else:
        score = sacrebleu.corpus_ter(hypos, [ref]).score

    return score
Example 3
    def _compute_bleu_ter(self, batch, user_vec, glob_vec, output, target):
        vocab = self.bert_tok.ids_to_tokens

        batch_data = self.translator.translate_batch(batch, vocab, False)
        translations = self.from_batch(batch_data)

        sys = []
        for trans in translations:
            line = ' '.join(trans[0]).replace(' ##', '').replace('##', '')
            sys.append(line)

        references = []
        for tgt in batch.tgt.squeeze(-1).transpose(0, 1):
            references.append(self._build_target_tokens(tgt))

        refs = []
        for ref in references:
            line = ' '.join(ref[1:]).replace(' ##', '').replace('##', '')
            refs.append(line)

        bleu = sacrebleu.corpus_bleu(sys, [refs], force=True)
        ter = sacrebleu.corpus_ter(sys, [refs])

        bleu_stats = onmt.utils.Statistics(bleu=bleu.score * len(sys),
                                           sent=len(sys))
        ter_stats = onmt.utils.Statistics(ter=ter.score * len(sys),
                                          sent=len(sys))

        return (bleu_stats, ter_stats)
Example 4
def compute_metrics(hyp_dec_all,
                    ref_dec_all,
                    use_sacrebleu=True,
                    use_torchtext=True,
                    use_ter=False):
    metrics = {}

    # Sacrebleu
    if use_sacrebleu:
        metrics["sacrebleu_rawcorpusbleu"] = sacrebleu.raw_corpus_bleu(
            hyp_dec_all, [ref_dec_all]).score
        metrics["sacrebleu_bleu"] = sacrebleu.corpus_bleu(
            hyp_dec_all, [ref_dec_all]).score
        metrics["sacrebleu_chrf"] = sacrebleu.corpus_chrf(
            hyp_dec_all, [ref_dec_all]).score
        if use_ter:  # Quite slow
            metrics["sacrebleu_ter"] = sacrebleu.corpus_ter(
                hyp_dec_all, [ref_dec_all]).score

    # Torchtext
    if use_torchtext:
        m_bleu_score = bleu_score([x.split(" ") for x in hyp_dec_all],
                                  [[x.split(" ")] for x in ref_dec_all])
        metrics["torchtext_bleu"] = m_bleu_score * 100
    return metrics
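The `* 100` on the torchtext result compensates for a scale difference: `torchtext.data.metrics.bleu_score` returns a fraction in [0, 1], while sacreBLEU reports BLEU on a 0-100 scale. A small sketch with invented sentences, assuming a torchtext version that still ships `bleu_score`:

import sacrebleu
from torchtext.data.metrics import bleu_score

hyps = ["the quick brown fox jumps over the lazy dog"]
refs = ["the quick brown fox jumps over the lazy dog"]

# torchtext expects pre-tokenized input and returns a value in [0, 1]
tt_bleu = bleu_score([h.split(" ") for h in hyps], [[r.split(" ")] for r in refs])

# sacreBLEU tokenizes internally and reports on a 0-100 scale
sb_bleu = sacrebleu.corpus_bleu(hyps, [refs]).score

print(tt_bleu * 100, sb_bleu)  # both close to 100 for identical strings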
Example 5
def compute_metrics(ref, hyp, hyp_order, metric):
    # Read sentences
    refs = []
    hyps = []
    for id in hyp_order:
        for segment in hyp[id]:
            hyps.append(segment)
        try:
            for segment in ref[id]:
                refs.append(segment)
        except KeyError:
            sys.stderr.write('Error: there are no references for document'
                             + ' "' + id + '"\n')
            sys.exit(-1)

    scores = []
    dir = os.path.dirname(os.path.realpath(sys.argv[0]))
    for n in range(len(hyps)):
        if metric == 'bleu':
            try:
                score = sacrebleu.corpus_bleu([hyps[n]], [[refs[n]]])
            except EOFError:
                sys.stderr.write('Error: source and reference have different'
                                 + ' lengths.\n')
                sys.exit(-1)

        elif metric == 'ter':
            try:
                score = sacrebleu.corpus_ter([hyps[n]], [[refs[n]]])
            except EOFError:
                sys.stderr.write('Error: source and reference have different'
                                 + ' lengths.\n')
                sys.exit(-1)

        else:
            hyps_file = save_to_file(hyps[n])
            refs_file = save_to_file(refs[n])
            try:
                process = subprocess.Popen((dir + '/beer_2.0/beer -s '
                                            + hyps_file + ' -r '
                                            + refs_file).split(),
                                           stdout=subprocess.PIPE)
                score, error = process.communicate()
            except FileNotFoundError:
                sys.stderr.write('Error: Beer requirement has not been '
                                 + 'satisfied.\n')
                sys.exit(-1)

            # Delete aux files
            process = subprocess.Popen(('rm ' + hyps_file + ' '
                                        + refs_file).split(),
                                       stdout=subprocess.PIPE)
            output, error = process.communicate()

        if metric == 'beer':
            scores.append([float(score.split()[-1])])
        else:
            scores.append([score.score])

    return scores
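Scoring one segment at a time through `corpus_bleu`/`corpus_ter`, as the loop above does, is effectively sentence-level evaluation; recent sacrebleu releases also expose sentence-level helpers directly. A minimal sketch with made-up sentences, assuming a version that exports `sentence_bleu` and `sentence_ter`:

import sacrebleu

hyp = "the cat sat on the mat"
ref = "the cat is on the mat"

# sentence-level helpers take one hypothesis string and a list of reference strings
print(sacrebleu.sentence_bleu(hyp, [ref]).score)
print(sacrebleu.sentence_ter(hyp, [ref]).score)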
Example 6
def ter(items):
    """Translation Error Rate is an error metric for machine translation that
    measures the number of edits required to change a system output into one
    of the references
    Source: http://www.cs.umd.edu/~snover/tercom/
    Paper: http://mt-archive.info/AMTA-2006-Snover.pdf

    Lower is better
    """
    refs = list(zip(*items))[0]
    preds = list(zip(*items))[1]
    refs, preds = _sacreformat(refs, preds)
    return sacrebleu.corpus_ter(preds, refs).score
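Because lower is better, an easy sanity check is to score a hypothesis that differs from its reference by a known number of edits. A toy sketch with invented sentences (whether `.score` comes back as a fraction or on a 0-100 scale depends on the sacrebleu version):

import sacrebleu

preds = ["the cat sat on a mat"]      # one substitution vs. the reference
refs = [["the cat sat on the mat"]]   # one reference stream

ter = sacrebleu.corpus_ter(preds, refs)
# 1 edit over 6 reference words
print(ter.score, ter.num_edits, ter.ref_length)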
Example 7
def compute_metrics(ref, hyp, hyp_order):
    # Read sentences
    refs = []
    hyps = []
    for id in hyp_order:
        for segment in hyp[id]:
            hyps.append(segment)
        try:
            for segment in ref[id]:
                refs.append(segment)
        except KeyError:
            sys.stderr.write('Error: there are no references for document' +
                             ' "' + id + '"\n')
            sys.exit(-1)

    # Compute BLEU and TER
    try:
        bleu = sacrebleu.corpus_bleu(hyps, [refs])
        ter = sacrebleu.corpus_ter(hyps, [refs])
    except EOFError:
        sys.stderr.write('Error: source and reference have different' +
                         ' lengths.\n')
        sys.exit(-1)

    # Create aux files for BEER
    dir = os.path.dirname(os.path.realpath(sys.argv[0]))
    hyps_file = save_to_file(hyps)
    refs_file = save_to_file(refs)

    # Compute BEER
    try:
        process = subprocess.Popen((dir + '/beer_2.0/beer -s ' + hyps_file +
                                    ' -r ' + refs_file).split(),
                                   stdout=subprocess.PIPE)
        beer, error = process.communicate()
    except FileNotFoundError:
        sys.stderr.write('Error: Beer requirement has not been satisfied.\n')
        sys.exit(-1)

    # Delete aux files
    process = subprocess.Popen(('rm ' + hyps_file + ' ' + refs_file).split(),
                               stdout=subprocess.PIPE)
    output, error = process.communicate()

    return bleu.score, ter.score, float(beer.split()[-1])
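The BEER call above shells out with `subprocess.Popen` and then spawns `rm` to delete the auxiliary files. Below is a sketch of the same idea using `subprocess.run` and `tempfile` instead; the `-s`/`-r` flags and the last-field parsing mirror the original, while the `beer_score` name and the `beer_bin` default path are assumptions for illustration:

import os
import subprocess
import tempfile


def beer_score(hyps, refs, beer_bin="./beer_2.0/beer"):
    """Write hypotheses/references to temp files, run BEER, and parse its last output field."""
    hyp_file = tempfile.NamedTemporaryFile("w", suffix=".hyp", delete=False)
    ref_file = tempfile.NamedTemporaryFile("w", suffix=".ref", delete=False)
    try:
        hyp_file.write("\n".join(hyps) + "\n")
        ref_file.write("\n".join(refs) + "\n")
        hyp_file.close()
        ref_file.close()
        result = subprocess.run([beer_bin, "-s", hyp_file.name, "-r", ref_file.name],
                                capture_output=True, text=True, check=True)
        return float(result.stdout.split()[-1])
    finally:
        os.remove(hyp_file.name)
        os.remove(ref_file.name)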
Example 8
    def __call__(self, ref_path, hyp_path):
        with tf.io.gfile.GFile(ref_path) as ref_stream, \
                tf.io.gfile.GFile(hyp_path) as sys_stream:
            ter = sacrebleu.corpus_ter(sys_stream, [ref_stream])
            return ter.score
Example 9
    def __call__(self, ref_path, hyp_path):
        sys_stream = _get_lines(hyp_path)
        ref_stream = _get_lines(ref_path)
        ter = sacrebleu.corpus_ter(sys_stream, [ref_stream])
        return ter.score
Example 10
def test(
    experiment: str,
    test_set: str,
    cuda: bool,
    seed: int,
    sample: bool,
    top_p: float,
    temperature: float,
    num_beams: int,
    to_json: str,
) -> None:
    """Testing function where a trained model is tested in its ability to rank candidate
    answers and produce replies.
    """
    logging.disable(logging.WARNING)
    model = PersonaGPT2.from_experiment(experiment)
    seed_everything(seed)

    cuda = cuda and torch.cuda.is_available()
    if cuda:
        model.to("cuda")

    with open(test_set, "r", encoding="utf-8") as f:
        dataset = json.loads(f.read())

    replies, rankings = [], []
    for dialog in tqdm(dataset, desc="Scoring dialogs...", dynamic_ncols=True):

        # 1) Prepares Persona
        persona = dialog["personality"].copy()
        persona_ids = [model.tokenizer.encode(s) for s in persona]

        for utterance in dialog["utterances"]:

            # 2) Saves Ground-Truth
            ground_truth_reply = utterance["candidates"][-1]

            # 3) Prepares History
            history = utterance["history"][-(2 * model.hparams.max_history +
                                             1):]
            history_ids = [model.tokenizer.encode(h) for h in history]

            # 4) Rank Candidates in batch:
            batch = []
            for candidate in utterance["candidates"]:
                candidate_ids = model.tokenizer.encode(candidate)
                instance = DataModule.build_input(
                    tokenizer=model.tokenizer,
                    persona=persona_ids,
                    history=history_ids,
                    reply=candidate_ids,
                )
                batch.append(instance)

            # from list of dictionaries to dictionary of lists
            batch = {k: [d[k] for d in batch] for k in batch[0]}
            batch = DataModule.pad_dataset(batch)
            if cuda:
                batch = {
                    k: torch.LongTensor(v).cuda()
                    for k, v in batch.items()
                }
            else:
                batch = {k: torch.LongTensor(v) for k, v in batch.items()}

            mc_logits = model(**batch).mc_logits

            rankings.append({
                "persona": persona,
                "history": history,
                "candidates": utterance["candidates"],
                "ranking": torch.topk(
                    mc_logits, len(utterance["candidates"])).indices.tolist(),
            })

            # 5) Generates Reply
            bot_input = DataModule.build_input(tokenizer=model.tokenizer,
                                               persona=persona_ids,
                                               history=history_ids)
            # Nucleus Sampling
            if sample:
                history_ids = model.generate(
                    input_ids=torch.LongTensor([bot_input["input_ids"]
                                                ]).cuda()
                    if cuda else torch.LongTensor([bot_input["input_ids"]]),
                    token_type_ids=torch.LongTensor(
                        [bot_input["token_type_ids"]]).cuda() if cuda else
                    torch.LongTensor([bot_input["token_type_ids"]]),
                    max_length=200,
                    do_sample=True,
                    top_p=top_p,
                    temperature=temperature,
                )
            # Beam Search
            else:
                history_ids = model.generate(
                    input_ids=torch.LongTensor([bot_input["input_ids"]
                                                ]).cuda()
                    if cuda else torch.LongTensor([bot_input["input_ids"]]),
                    token_type_ids=torch.LongTensor(
                        [bot_input["token_type_ids"]]).cuda() if cuda else
                    torch.LongTensor([bot_input["token_type_ids"]]),
                    max_length=200,
                    num_beams=num_beams,
                    no_repeat_ngram_size=2,
                    early_stopping=True,
                )

            bot_reply_ids = history_ids[:, len(bot_input["input_ids"]):][0]
            bot_reply = model.tokenizer.decode(bot_reply_ids,
                                               skip_special_tokens=True)

            replies.append({
                "persona": persona,
                "history": history,
                "bot": " ".join(wordpunct_tokenize(bot_reply.lower())),
                "human": ground_truth_reply,
            })

    # 6) Runs Ranking Metrics
    hits_1, hits_5, hits_10 = [], [], []
    for ranks in rankings:
        hits_1.append((len(ranks["candidates"]) - 1) in ranks["ranking"][:1])
        hits_5.append((len(ranks["candidates"]) - 1) in ranks["ranking"][:5])
        hits_10.append((len(ranks["candidates"]) - 1) in ranks["ranking"][:10])

    click.secho("Hits@1: {}".format(sum(hits_1) / len(hits_1)), fg="yellow")
    click.secho("Hits@5: {}".format(sum(hits_5) / len(hits_5)), fg="yellow")
    click.secho("Hits@10: {}".format(sum(hits_10) / len(hits_10)), fg="yellow")

    # 7) Runs Generation Metrics
    refs = [[s["human"] for s in replies]]
    sys = [s["bot"] for s in replies]

    bleu = sacrebleu.corpus_bleu(sys, refs, lowercase=True,
                                 tokenize="intl").score
    click.secho(f"BLEU: {bleu}", fg="blue")
    ter = sacrebleu.corpus_ter(sys, refs, no_punct=True).score
    click.secho(f"TER: {ter}", fg="blue")

    # BERTScore returns precision, recall, and F1; we use the F1 component
    bertscore = float(
        bert_score.score(
            cands=sys,
            refs=refs[0],
            lang="en",
            verbose=False,
            nthreads=4,
        )[2].mean())
    click.secho(f"BERTScore: {bertscore}", fg="blue")

    # 8) Saves results.
    if isinstance(to_json, str):
        data = {
            "results": {
                "BLEU": bleu,
                "TER": ter,
                "BERTScore": bertscore,
                "Hits@1": sum(hits_1) / len(hits_1),
                "Hits@5": sum(hits_5) / len(hits_5),
                "Hits@10": sum(hits_10) / len(hits_10),
            },
            "generation": replies,
            "ranking": rankings,
        }
        with open(to_json, "w") as outfile:
            json.dump(data, outfile, ensure_ascii=False, indent=4)
        click.secho(f"Predictions saved in: {to_json}.", fg="yellow")
Example 11
def get_ter(in_sent, target_sent):
    ter = sacrebleu.corpus_ter([in_sent], [[target_sent]])
    out = " ".join(map(str, [ter.score, ter.num_edits, ter.ref_length]))
    return out
Example 12
def compute_ter_score(hyps: Iterable[str], refs: List[Iterable[str]]) -> float:
    result = sacrebleu.corpus_ter(hyps, refs)
    return float(np.round(float(result.score) * 100, 2))