Example 1
def test_offset(hypothesis, reference, expected_with_offset,
                expected_without_offset):
    score_without_offset = sacrebleu.raw_corpus_bleu(hypothesis, reference,
                                                     0.0).score / 100
    assert abs(expected_without_offset - score_without_offset) < EPSILON

    score_with_offset = sacrebleu.raw_corpus_bleu(hypothesis, reference,
                                                  0.1).score / 100
    assert abs(expected_with_offset - score_with_offset) < EPSILON
Example 2
def test_offset(hypothesis, reference, expected_with_offset,
                expected_without_offset):
    score_without_offset = sacrebleu.raw_corpus_bleu(hypothesis, reference,
                                                     0.0).score / 100
    assert abs(expected_without_offset - score_without_offset) < EPSILON

    # let it use BLEU's internal default of 0.1 by passing `None`
    score_with_offset = sacrebleu.raw_corpus_bleu(hypothesis, reference,
                                                  None).score / 100
    assert abs(expected_with_offset - score_with_offset) < EPSILON

    # let it use BLEU's internal default of 0.1 by omitting the argument
    score_with_offset = sacrebleu.raw_corpus_bleu(hypothesis,
                                                  reference).score / 100
    assert abs(expected_with_offset - score_with_offset) < EPSILON
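As a standalone illustration of what these offset tests exercise (toy sentences and an assumed EPSILON value, not the real fixtures): the third argument to sacrebleu.raw_corpus_bleu is a floor-smoothing value that only matters when some n-gram order has zero matches.
import sacrebleu

EPSILON = 1e-8  # tolerance for float comparisons (assumed value)

hypothesis = ["a quick brown fox"]
reference = [["the quick brown fox jumps over the lazy dog"]]

# with no floor, the zero 4-gram count drives the geometric mean to zero
score_no_floor = sacrebleu.raw_corpus_bleu(hypothesis, reference, 0.0).score / 100
# with a floor of 0.1, the zero count is replaced and the score becomes non-zero
score_with_floor = sacrebleu.raw_corpus_bleu(hypothesis, reference, 0.1).score / 100

assert abs(score_no_floor - 0.0) < EPSILON
assert score_with_floor > score_no_floor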
Example 3
def eval_val(file_name, model, valid_iter, targ_field, datasets):
    references = [" ".join(example.trg) for example in datasets["val"]]

    hypotheses = []
    alphas = []  # save the last attention scores
    for batch in valid_iter:
        batch = rebatch(PAD_INDEX, batch)
        pred, attention = beam_decode(
            model,
            batch.src,
            batch.src_mask,
            batch.src_lengths,
            sos_index=targ_field.vocab.stoi[SOS_TOKEN],
            eos_index=targ_field.vocab.stoi[EOS_TOKEN])
        hypotheses.append(pred)
        alphas.append(attention)

    hypotheses = [lookup_words(x, targ_field.vocab) for x in hypotheses]
    hypotheses = [" ".join(x) for x in hypotheses]

    with open(file_name, "w") as file:
        for line in hypotheses:
            file.write(line + "\n")

    bleu = sacrebleu.raw_corpus_bleu(hypotheses, [references], .01).score
    print(bleu)
Example 4
    def get_bleu_metrics(self, dev_samples, model: Model):
        reached_eof = False
        current_index = 0
        all_hypothesis_sentences = []
        all_reference_sentences = []
        while not reached_eof:
            batch_x, batch_y, batch_z, batch_tt, actual_batch_size, reached_eof = read_batch_from_samples(
                dev_samples, self.batch_size, self.token_per_batch,
                current_index, model.config.data_config.input_features,
                model.config.data_config.output_features,
                model.config.data_config.output_translations,
                model.config.data_config.output_translation_features,
                model.config.data_config.input_clear_text,
                model.config.data_config.output_translation_clear_text)
            if actual_batch_size == 0:
                break
            reference = unpad_turn_to_text_and_remove_bpe_of_batch_t(
                batch_tt[0][0],
                model.config.data_config.output_translation_vocabularies[0][0])
            for sentence in reference:
                all_reference_sentences.append(sentence)
            output = model.predict_translation_on_batch(batch_x)
            output = unpad_turn_to_text_and_remove_bpe_of_batch_t(
                output,
                model.config.data_config.output_translation_vocabularies[0][0])
            for sentence in output:
                all_hypothesis_sentences.append(sentence)
            current_index += actual_batch_size
            if reached_eof is True:
                break
        bleu = sacrebleu.raw_corpus_bleu(
            sys_stream=all_hypothesis_sentences,
            ref_streams=[all_reference_sentences])
        return bleu.score
Example 5
def compute_metrics(hyp_dec_all,
                    ref_dec_all,
                    use_sacrebleu=True,
                    use_torchtext=True,
                    use_ter=False):
    metrics = {}

    # Sacrebleu
    if use_sacrebleu:
        metrics["sacrebleu_rawcorpusbleu"] = sacrebleu.raw_corpus_bleu(
            hyp_dec_all, [ref_dec_all]).score
        metrics["sacrebleu_bleu"] = sacrebleu.corpus_bleu(
            hyp_dec_all, [ref_dec_all]).score
        metrics["sacrebleu_chrf"] = sacrebleu.corpus_chrf(
            hyp_dec_all, [ref_dec_all]).score
        if use_ter:  # Quite slow
            metrics["sacrebleu_ter"] = sacrebleu.corpus_ter(
                hyp_dec_all, [ref_dec_all]).score

    # Torchtext
    if use_torchtext:
        m_bleu_score = bleu_score([x.split(" ") for x in hyp_dec_all],
                                  [[x.split(" ")] for x in ref_dec_all])
        metrics["torchtext_bleu"] = m_bleu_score * 100
    return metrics
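An illustrative call with toy inputs rather than real decoder output; hyp_dec_all and ref_dec_all are assumed to be parallel lists of detokenized sentence strings, and torchtext is switched off so the sketch only depends on sacrebleu.
hyp_dec_all = ["the cat is on the mat", "a dog barks"]
ref_dec_all = ["the cat sat on the mat", "the dog barks"]
scores = compute_metrics(hyp_dec_all, ref_dec_all, use_torchtext=False)
print(scores)  # {'sacrebleu_rawcorpusbleu': ..., 'sacrebleu_bleu': ..., 'sacrebleu_chrf': ...}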
Example 6
def bleu_eval(ref_trans, new_trans, raw_trans=True):
    # returns a BLEU score
    # inputs are lists of strings and must have the same length
    if raw_trans:
        return sacrebleu.raw_corpus_bleu(new_trans, [ref_trans]).score
    else:
        return sacrebleu.corpus_bleu(new_trans, [ref_trans]).score
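A small aside on why the raw_trans flag matters, using made-up sentences rather than data from the project above: raw_corpus_bleu compares the strings exactly as given, while corpus_bleu first applies sacrebleu's default tokenizer, so differences in punctuation spacing only hurt the raw variant.
import sacrebleu

refs = ["The cat sat on the mat."]
hyps = ["The cat sat on the mat ."]  # same words, extra space before the period

print(sacrebleu.corpus_bleu(hyps, [refs]).score)      # ~100: the tokenizer splits "mat." anyway
print(sacrebleu.raw_corpus_bleu(hyps, [refs]).score)  # lower: "mat ." and "mat." do not match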
Example 7
def score(predicted, predicted_imgs, gold, gold_imgs):
    scores = dict()
    # BLEU
    bleu = sacrebleu.raw_corpus_bleu(predicted, [gold]).score
    scores['bleu'] = bleu
    # todo: String edit distance
    # todo: Image difference
    return scores
Example 8
def calculate_bleu_score(hyps: list, refs: list) -> float:
    """
    calculates bleu score.
    """
    assert len(refs) == len(
        hyps), "no of hypothesis and references sentences must be same length"
    bleu = raw_corpus_bleu(hyps, [refs])
    return bleu.score
Example 9
def calculate_bleu(predictions, labels):
    """
    Only pass lists of strings.
    """
    # this only uses n-grams up to order 4 (BLEU-4)

    bleu = sacrebleu.raw_corpus_bleu(predictions, [labels], .01).score
    return bleu
Example 11
def bleu(hypotheses, references):
    """
    Raw corpus BLEU from sacrebleu (without tokenization)
    :param hypotheses: list of hypotheses (strings)
    :param references: list of references (strings)
    :return:
    """
    return sacrebleu.raw_corpus_bleu(sys_stream=hypotheses,
                                     ref_streams=[references]).score
Example 12
def raw_corpus_bleu(hypotheses: Iterable[str], references: Iterable[str], offset: Optional[float] = 0.01) -> float:
    """
    Simple wrapper around sacreBLEU's BLEU without tokenization and smoothing.

    :param hypotheses: Hypotheses stream.
    :param references: Reference stream.
    :param offset: Smoothing constant.
    :return: BLEU score as float between 0 and 1.
    """
    return sacrebleu.raw_corpus_bleu(hypotheses, [references], smooth_floor=offset).score / 100.0
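Hypothetical usage of the wrapper above with made-up sentences: because the result is divided by 100, callers receive a score between 0 and 1, and the offset only changes the outcome when some n-gram order has no matches.
hyps = ["the cat sat on the mat"]
refs = ["the cat sat on the mat"]
print(raw_corpus_bleu(hyps, refs))       # 1.0 for an exact match
print(raw_corpus_bleu(hyps, refs, 0.1))  # unchanged here, since no counts are zero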
Example 13
    def score_results(self, results):
        if self.tgt_dataset is None:
            return []

        ref_stream = (line.replace(self.bpe_symbol, '')
                      for line in self.tgt_dataset)
        sys_stream = (line.replace(self.bpe_symbol, '')
                      for line in results[::len(results) // len(self.dataset)])
        bleu = sacrebleu.raw_corpus_bleu(sys_stream, [ref_stream])
        return ["{:.2f} BLEU".format(bleu.score)]
Example 14
def evaluate_bleu(predictions, labels):
    bleu_sacre = None  # returned unchanged if the BLEU computation fails below
    try:
        bleu_sacre = sacrebleu.raw_corpus_bleu(predictions, [labels],
                                               .01).score
    except (KeyboardInterrupt, SystemExit):
        raise
    except BaseException as e:
        print("\nWARNING: Could not compute BLEU-score. Error:", str(e))

    return bleu_sacre
Example 15
def compute_bleu(hypotheses, references, epoch, config, direction, kl=None):
    bleu = sacrebleu.raw_corpus_bleu(hypotheses, [references]).score
    scores = '{}/{}/bleu-scores.txt'.format(config["out_dir"],
                                            config["session"])
    with open(scores, 'a') as f_score:
        sentence = "Epoch: {}, Bleu {}".format(epoch, bleu, direction)
        if kl is not None:
            sentence += ", KL: {}".format(kl)
        sentence += ", Direction {}\n"
        f_score.write(sentence)
    return bleu
Example 16
def compute_bleu(hypotheses, references, subword_token=None):
    """
    Computes sacrebleu for a single set of references.
    """

    # Remove any subword tokens such as "@@".
    if subword_token is not None:
        references = remove_subword_tokens(references, subword_token)
        hypotheses = remove_subword_tokens(hypotheses, subword_token)

    # Compute the BLEU score.
    return sacrebleu.raw_corpus_bleu(hypotheses, [references]).score
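remove_subword_tokens is not shown in this example. Below is a minimal sketch of what such a helper might look like, assuming BPE-style continuation markers such as "@@" followed by a space; the actual helper in the original project may differ.
import re

def remove_subword_tokens(sentences, subword_token):
    """Rejoin subword pieces by deleting the marker and the space that follows it."""
    pattern = re.escape(subword_token) + r"\s*"
    return [re.sub(pattern, "", sentence) for sentence in sentences]

print(remove_subword_tokens(["a sm@@ all te@@ st"], "@@"))  # ['a small test']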
Example 17
def bleu(itos, translation_output, reference):
    '''
    Args:
        itos: a list mapping indices to strings (e.g. trg.vocab.itos).
        translation_output: 2D tensor of translation output, shape N x B.
        reference: 1D list of reference sentences (words, not indices); len(reference) == B.
    '''
    EN_ind2word = np.array(itos)
    detok_translation = detok(translation_output, EN_ind2word)
    bleu_score = sacrebleu.raw_corpus_bleu(detok_translation, [reference], .01).score
    return bleu_score
Example 18
  def compute(self,
              labels: Sequence[Text],
              preds: Sequence[Text],
              label_spec: types.TextSegment,
              pred_spec: types.GeneratedText,
              config: Optional[JsonDict] = None) -> Dict[Text, float]:
    """Compute metric(s) between labels and predictions."""
    del label_spec
    del pred_spec
    del config

    if not labels or not preds:
      return {}

    bleu = sacrebleu.raw_corpus_bleu(preds, [labels])
    return {'corpus_bleu': bleu.score}
Example 19
    def __call__(self, output, target):
        """Computes the BLEU score of a translation task

        Args:
            output (list of str): Translated output sentences (not tokenized)
            target (list of str): Target reference sentences

        Returns:
            loss (:obj:`torch.Tensor`): BLEU score
        """
        if self.use_raw:
            bleu_score = sacrebleu.raw_corpus_bleu(output, [target]).score
        else:
            bleu_score = sacrebleu.corpus_bleu(
                output, [target], tokenize="intl", lowercase=True
            ).score
        return torch.tensor([bleu_score])
Example 20
    def evaluate(self, iterator):
        """
        Evaluation loop for the model

        :param iterator: PyTorch DataIterator instance

        :return: tuple of (average epoch loss, average BLEU score)
        """
        # disable training of model layers
        self.model.eval()

        epoch_loss = 0
        accuracy = 0

        # don't update model parameters
        with torch.no_grad():
            for i, batch in tqdm(enumerate(iterator),
                                 total=len(iterator),
                                 desc='evaluation loop'):
                # get source and target data
                src, src_lengths = batch.src
                trg, trg_lengths = batch.trg

                output, _ = self.model(src, trg, src_lengths, trg_lengths)
                decoded_output = self.model.decoder.decode_mechanism(output)

                # reshape same as train loop
                y_pred = output[:, 1:].contiguous().view(-1, output.size(-1))
                y = trg[:, 1:].contiguous().view(-1)

                # compute loss
                loss = self.criterion(y_pred, y)

                epoch_loss += loss.item()

                # using BLEU score for machine translation tasks
                accuracy += sacrebleu.raw_corpus_bleu(
                    sys_stream=self.lookup_words(decoded_output),
                    ref_streams=[self.lookup_words(trg)]).score

        # return the average loss and the average BLEU score
        return epoch_loss / len(iterator), accuracy / len(iterator)
Example 21
  def compute(self,
              labels: Sequence[Text],
              preds: Sequence[Union[Text, types.ScoredTextCandidates]],
              label_spec: types.TextSegment,
              pred_spec: Union[types.GeneratedText,
                               types.GeneratedTextCandidates],
              config: Optional[JsonDict] = None) -> Dict[Text, float]:
    """Compute metric(s) between labels and predictions."""
    del label_spec
    del config

    if not labels or not preds:
      return {}

    name_suffix = ''
    if isinstance(pred_spec, types.GeneratedTextCandidates):
      preds = [types.GeneratedTextCandidates.top_text(v) for v in preds]
      name_suffix = '@1'
    bleu = sacrebleu.raw_corpus_bleu(preds, [labels], BLEU_SMOOTHING_VAL)

    return {'corpus_bleu' + name_suffix: bleu.score}
Example 22
def run_test(model, test_data, test_iter):
    preds = []

    for k, batch in enumerate(test_iter):
        model.eval()
        with torch.no_grad():
            batch = rebatch(PAD_INDEX, batch)
            pred, attn = greedy_decode(model,
                                       batch.src,
                                       batch.src_mask,
                                       batch.src_lengths,
                                       max_len=25,
                                       sos_index=TRG.vocab.stoi[SOS_TOKEN],
                                       eos_index=TRG.vocab.stoi[EOS_TOKEN])

            preds.append(pred)

    hypotheses = [lookup_words(pred, TRG.vocab) for pred in preds]
    hypotheses = [" ".join(h) for h in hypotheses]
    references = [" ".join(data.trg) for data in test_data]
    bleu = sacrebleu.raw_corpus_bleu(hypotheses, [references], .01).score
    print("BLEU score: ", bleu)

    return bleu
Example 23
        print_loss_total += loss.item() / target_sentences.size()[1]

        if i > 0 and i % print_every == 0:
            step_record.append(i + epoch * total_step)
            print('%s (%d %d%%) %.4f' % (timeSince(start, (i+epoch*total_step) /(total_step*EPOCH)), \
                                         i+epoch*total_step, ((i+epoch*total_step) /(total_step*EPOCH))*100, \
                                         print_loss_total/print_every))
            train_loss_record.append(
                [i + epoch * total_step, print_loss_total / print_every])
            print_loss_total = 0
        if i > 0 and i % val_every == 0:
            val_loss = dataset_loss(encoder, decoder, val_loader)
            predict_sentences = evaluate(val_loader, encoder, decoder,
                                         output_lang)
            val_Bleu = raw_corpus_bleu(predict_sentences, [dev_en]).score
            print('step: {}, valNLLLoss: {}, valBleuScore: {}'.format(
                i + epoch * total_step, val_loss, val_Bleu))
            val_loss_record.append(
                [i + epoch * total_step, val_loss, val_Bleu])
            if epoch > 0 and val_Bleu > max_Bleu:
                torch.save(encoder.state_dict(), res_dataDir_cn + \
                           "//zh_tmp_encoder_1210_simple_t2_"+str(EMB_SIZE)+'_'+str(HIDDEN_SIZE)+'_'+str(EPOCH)+".pth")
                torch.save(decoder.state_dict(), res_dataDir_cn + \
                           "//zh_tmp_attn_decoder_1210_simple_t2_"+str(EMB_SIZE)+'_'+str(HIDDEN_SIZE)+'_'+str(EPOCH)+".pth")
            max_Bleu = max(max_Bleu, val_Bleu)
            print('Cache memo after Eva:{}'.format(
                torch.cuda.memory_allocated()))
            # Early Termination
            if epoch > 0 and val_loss_record[-1][2] < max_Bleu-3 and \
                                        val_loss_record[-2][2] < max_Bleu-2 and \
Example 24
def test_statistics(hypothesis, reference, expected_stat):
    result = sacrebleu.raw_corpus_bleu(hypothesis, reference, .01)
    stat = Statistics(result.counts, result.totals)
    assert stat == expected_stat
Example 25
def test_effective_order(hypotheses, references, expected_bleu):
    bleu = sacrebleu.raw_corpus_bleu(hypotheses, references, .01).score / 100
    assert abs(bleu - expected_bleu) < EPSILON
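What "effective order" means in practice, shown with toy data rather than the test fixtures: raw_corpus_bleu enables sacrebleu's use_effective_order, so a hypothesis shorter than four tokens is scored only over the n-gram orders that actually occur instead of collapsing to zero.
import sacrebleu

# a two-token hypothesis has no 3-grams or 4-grams at all
print(sacrebleu.raw_corpus_bleu(["cat sat"], [["cat sat"]], .01).score)  # 100.0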
Example 26
train_loss = []
Bleu_score = []
for epoch in range(30):
    total_loss = 0
    for idx, batch_data in enumerate(train_iter):
        input, in_lens, output, out_lens = batch_data
        loss = train_attn(input, output, in_lens, out_lens, encoder, decoder,
                          encoder_optim, decoder_optim, 0.1)
        total_loss += loss
        if idx % 800 == 0:
            print('Training Loss: {}'.format(loss))
    train_loss.append((total_loss / (idx + 1)))
    translated_corp = []
    output_corp = []
    for idx, batch_data in enumerate(valid_iter):
        input, in_lens, output, out_lens = batch_data
        # NOTE: the decoded hypotheses and references from Beam_evaluate are expected
        # to populate translated_corp and output_corp; as written, both lists stay empty
        Beam_evaluate(encoder, decoder, batch_data)

    BLEU_score_epo = sacrebleu.raw_corpus_bleu(translated_corp,
                                               [output_corp]).score
    print('Validation Score After Epoch: {}'.format(BLEU_score_epo))
    # save the checkpoint whenever the validation BLEU improves (and after the first epoch)
    if not Bleu_score or BLEU_score_epo > max(Bleu_score):
        torch.save(encoder.state_dict(), 'encoder_vi.pth')
        torch.save(decoder.state_dict(), 'decoder_vi.pth')
    Bleu_score.append(BLEU_score_epo)
    print(train_loss)
    print(Bleu_score)
Example 27
    def evaluate(self,
                 sentences_file="eval_sentences.txt",
                 distance_file="distances.txt",
                 mutual_avg_file="mutual_distances.txt",
                 mutual_avg_file_A="mutual_distances_A.txt",
                 mutual_avg_file_B="mutual_distances_B.txt",
                 top_k_file="top_k.txt",
                 sacre_file="sacre_bleu.tsv"):
        #logging.info("\n\nEvaluating...")

        self.netG_AB.module.eval()
        self.netG_BA.module.eval()
        self.netD_AB.module.eval()
        self.netD_BA.module.eval()

        with torch.no_grad():
            self.forward()  # forward pass only; no gradients or weight updates under torch.no_grad()
        gc.collect()
        with open(sentences_file, "a") as sentences_file:
            for j in range(len(self.real_A)):
                str1 = " A->B->A : " + self.real_A[j] + " -> " + self.fake_B[
                    j] + " -> " + self.rec_A[j]
                str2 = " B->A->B : " + self.real_B[j] + " -> " + self.fake_A[
                    j] + " -> " + self.rec_B[j]
                #logging.info(str1)
                #logging.info(str2)
                sentences_file.write('%s\n' % str1)  # save the message
                sentences_file.write('%s\n\n' % str2)  # save the message

        distances = sklearn.metrics.pairwise_distances(
            self.fake_A_embeddings.cpu().detach().numpy(),
            self.fake_B_embeddings.cpu().detach().numpy(),
            metric='cosine',
            n_jobs=-1)

        triang = np.triu(distances)
        np.fill_diagonal(triang, 0)
        mutual_distances = np.sum(triang) / np.count_nonzero(triang)
        with open(mutual_avg_file, "a") as mutual_avg_f:
            mutual_avg_f.write(str(mutual_distances) + '\n')

        with open(distance_file, "a") as distances_file:
            for i in range(len(distances)):
                distances_file.write(str(distances[i][i]) + '\n')

        distances_fake_A = sklearn.metrics.pairwise_distances(
            self.fake_A_embeddings.cpu().detach().numpy(),
            self.fake_A_embeddings.cpu().detach().numpy(),
            metric='cosine',
            n_jobs=-1)
        triang = np.triu(distances_fake_A)
        np.fill_diagonal(triang, 0)
        mutual_distances = np.sum(triang) / np.count_nonzero(triang)
        with open(mutual_avg_file_A, "a") as mutual_avg_f:
            mutual_avg_f.write(str(mutual_distances) + '\n')

        distances_fake_B = sklearn.metrics.pairwise_distances(
            self.fake_B_embeddings.cpu().detach().numpy(),
            self.fake_B_embeddings.cpu().detach().numpy(),
            metric='cosine',
            n_jobs=-1)
        triang = np.triu(distances_fake_B)
        np.fill_diagonal(triang, 0)
        mutual_distances = np.sum(triang) / np.count_nonzero(triang)
        with open(mutual_avg_file_B, "a") as mutual_avg_f:
            mutual_avg_f.write(str(mutual_distances) + '\n')
        #with open(distance_file, "a") as distances_file:
        #   distances_file.write(distances+"\n")

        dim = len(distances)
        top_k = np.zeros(dim, dtype=float)  # np.float was removed in newer NumPy versions
        for i in range(dim):

            lower = 0
            for j in range(len(distances[i])):
                if i != j and distances[i][i] > distances[i][j]:
                    lower += 1
            top_k[lower] += 1

        with open(top_k_file, "a") as top_file:
            tot = 0
            for i in range(dim):
                top_k[i] = top_k[i] / dim * 100
                tot += top_k[i]
                top_file.write('Top ' + str(i + 1) + ': ' + str(tot) + '%\n')
        # for each sentence, record how far away the real embedding is (how many others are closer)
        # and build a ranking of how many have the real embedding in the top 1, top 2 and so on (cumulative)
        # this information is saved to a file at every epoch

        bleu_fake_A = sacrebleu.raw_corpus_bleu(self.fake_A,
                                                [self.real_A]).score
        bleu_rec_A = sacrebleu.raw_corpus_bleu(self.rec_A, [self.real_A]).score
        bleu_fake_B = sacrebleu.raw_corpus_bleu(self.fake_B,
                                                [self.real_B]).score
        bleu_rec_B = sacrebleu.raw_corpus_bleu(self.rec_B, [self.real_B]).score

        with open(sacre_file, "a") as sacre_file:
            for i in range(dim):
                sacre_file.write(
                    str(bleu_fake_A) + '\t' + str(bleu_rec_A) + '\t' +
                    str(bleu_fake_B) + '\t' + str(bleu_rec_B) + '\n')

        self.netG_AB.module.train()
        self.netG_BA.module.train()
        self.netD_AB.module.train()
        self.netD_BA.module.train()
        gc.collect()
Example 28
def test_degenerate_uneven(hypotheses, references):
    with pytest.raises(EOFError, match=r'.*stream.*'):
        sacrebleu.raw_corpus_bleu(hypotheses, references)
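A standalone sketch of the failure mode this test pins down, with assumed toy inputs: when the system stream and a reference stream have different lengths, the sacrebleu versions these tests target raise an EOFError that mentions the streams.
import sacrebleu

try:
    sacrebleu.raw_corpus_bleu(["only one hypothesis"], [["ref one", "ref two"]])
except EOFError as err:
    print("caught:", err)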
Example 29
from __future__ import unicode_literals, print_function, division
from io import open
import io
import unicodedata
import string
import re
import random
import numpy as np
import torch
import torch.nn as nn
from torch import optim
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from tqdm import tqdm
from collections import Counter, namedtuple
import pickle
import sacrebleu
import pandas as pd
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
PAD = 0
SOS = 1
EOS = 2
UNK = 3
exclude = set(string.punctuation)
'''
https://stackoverflow.com/questions/4984647/accessing-dict-keys-like-an-attribute
'''
class AttrDict(dict):
    """Dict whose keys can also be read as attributes (pattern from the Stack Overflow link above)."""
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self
Example 30
    def forward(self,
                rationale_text_label,
                generate,
                q_id,
                question,
                features,
                spatials,
                segment_ids,
                input_mask,
                image_mask,
                co_attention_mask,
                num_options,
                freeze=-1):
        outs = self.vilbert_model(question,
                                  features,
                                  spatials,
                                  segment_ids,
                                  input_mask,
                                  image_mask,
                                  co_attention_mask,
                                  num_options=num_options)

        gpt2_inp, pred_ans = outs[7:]
        gpt2_inp = self.embed(gpt2_inp)
        gpt2_inputs = (gpt2_inp, rationale_text_label)
        gpt2_outputs = self.gpt2_model(gpt2_inputs,
                                       labels=rationale_text_label)
        gpt2_loss = gpt2_outputs[0]

        to_return = outs[:7] + (gpt2_loss, )

        references = []
        hypotheses = []
        if generate:
            out = sample_sequence(
                model=self.gpt2_model,
                context=gpt2_inp,
                length=30,  #TODO 3 * (self._max_caption_length//4
                temperature=1,  #TODO change here
            )

            for i in range(out.size()[0]):
                first_rat = out[i].tolist(
                )  #TODO changed from out[0, len(context_tokens):].tolist()
                text = self.gpt2_tokenizer.decode(
                    first_rat,
                    clean_up_tokenization_spaces=False,
                    skip_special_tokens=True)
                # text = text[: text.find(self.gpt2_tokenizer.stop_token)]

                rationale_text = self.gpt2_tokenizer.decode(
                    rationale_text_label[i].tolist(),
                    clean_up_tokenization_spaces=False,
                    skip_special_tokens=True)
                # rationale_text = rationale_text[: rationale_text.find(self.gpt2_tokenizer)]

                q_id_f = (q_id[i] - 1000000).item()
                pred_ans_f = pred_ans[i].item()
                logger.info(
                    "[Img ID: {}] Predicted Ans: {} \t| Gold rationale: {} | Generated rationale: {}"
                    .format(q_id_f, pred_ans_f, rationale_text, text))

                for rat_ids, gen_ids in zip(rationale_text_label.tolist(),
                                            out.tolist()):
                    rat_dec = self.gpt2_tokenizer.decode(
                        rat_ids,
                        clean_up_tokenization_spaces=False,
                        skip_special_tokens=True)
                    gen_dec = self.gpt2_tokenizer.decode(
                        gen_ids,
                        clean_up_tokenization_spaces=False,
                        skip_special_tokens=True)
                    references.append(rat_dec)
                    hypotheses.append(gen_dec)

            bleu_score = sacrebleu.raw_corpus_bleu(hypotheses, [references],
                                                   .01).score
            to_return = to_return + (bleu_score, )

        return to_return
Example 31
            dev_perplexities.append(dev_perplexity)

    return dev_perplexities


model = make_model(len(SRC.vocab),
                   len(TRG.vocab),
                   emb_size=256,
                   hidden_size=256,
                   num_layers=1,
                   dropout=0.2)
dev_perplexities = train(model, print_every=100)

hypotheses = ["this is a test"]
references = ["this is a test"]
bleu = sacrebleu.raw_corpus_bleu(hypotheses, [references], .01).score
print(bleu)

hypotheses = ["this is a test"]
references = ["this is a fest"]
bleu = sacrebleu.raw_corpus_bleu(hypotheses, [references], .01).score
print(bleu)

len(valid_data)

references = [" ".join(example.trg) for example in valid_data]
print(len(references))
print(references[0])

references[-2]