Python bleu Beispiele

Programmiersprache: Python

Namespace / Paketname: joeynmt.metrics

Methode / Funktion: bleu

Beispiele auf hotexamples.com: 12

Python bleu – 12 Beispiele gefunden. Dies sind die am besten bewerteten Python-Beispiele für joeynmt.metrics.bleu, die aus Open-Source-Projekten extrahiert wurden. Sie können Beispiele bewerten, um die Qualität der Beispiele zu verbessern.

Beispiel #1

Datei anzeigen

    def Reward_bleu_fin(self, trg, hyp, show=False):
        """
        Terminal-only reward, intended to be plugged in as ``self.Reward``.

        Target and hypothesis are decoded with the target vocabulary, joined
        into strings and scored with the metric named by ``self.eval_metric``.
        That single score is written into the last slot of an otherwise
        all-zero vector with ``len(hyp[0])`` entries; the first entry is then
        dropped before returning.

        :param trg: target index arrays.
        :param hyp: predicted index arrays; ``len(hyp[0])`` fixes the size of
            the reward vector.
        :param show: if True, print raw and decoded target/prediction together
            with the resulting rewards.
        :return: numpy array with ``len(hyp[0]) - 1`` entries: zeros
            everywhere except the last entry, which holds the metric score
            (-1 when there are no references, 0 for an unrecognised metric).
        """
        rewards = np.zeros(len(hyp[0]))

        target_tokens = self.model.trg_vocab.arrays_to_sentences(
            arrays=trg, cut_at_eos=True)
        hyp_tokens = self.model.trg_vocab.arrays_to_sentences(
            arrays=hyp, cut_at_eos=True)

        # words and BPE pieces are space-separated, characters are not
        separator = " " if self.level in ["word", "bpe"] else ""
        references = [separator.join(tokens) for tokens in target_tokens]
        hypotheses = [separator.join(tokens) for tokens in hyp_tokens]

        # undo the BPE segmentation before scoring
        if self.level == "bpe":
            references = [bpe_postprocess(ref) for ref in references]
            hypotheses = [bpe_postprocess(hyp_str) for hyp_str in hypotheses]

        if not references:
            # nothing to compare against
            score = -1
        else:
            assert len(hypotheses) == len(references)
            metric = self.eval_metric.lower()
            if metric == 'bleu':
                # this version does not use any tokenization
                score = bleu(hypotheses, references)
            elif metric == 'chrf':
                score = chrf(hypotheses, references)
            elif metric == 'token_accuracy':
                score = token_accuracy(hypotheses, references,
                                       level=self.level)
            elif metric == 'sequence_accuracy':
                score = sequence_accuracy(hypotheses, references)
            else:
                score = 0

        # only the final step carries a reward
        rewards[-1] = score
        final_rew = rewards[1:]

        if show:
            print(
                "\n Sample-------------Target vs Eval_net prediction:--Raw---and---Decoded-----"
            )
            print("Target: ", trg, target_tokens)
            print("Eval  : ", hyp, hyp_tokens)
            print("Reward: ", final_rew, "\n")

        return final_rew

Beispiel #2

Datei anzeigen

Datei: test_metric.py Projekt: Freshia/joeynmtmaml

 def test_bleu_ja_mecab(self):
     """
     Check the BLEU score of a Japanese sentence pair with ``ja-mecab``.

     The test is skipped only when computing the score itself raises
     (presumably because the optional tokenizer backend is unavailable --
     confirm against the metrics implementation). A score that differs
     from the expected value is reported as a regular test failure.
     """
     hyp = ["これはテストです。"]
     ref = ["あれがテストです。"]
     try:
         score = bleu(hyp, ref, tokenize="ja-mecab")
     except Exception as e:
         raise unittest.SkipTest(f"{e} Skip.") from e
     # The assertion lives outside the try block: AssertionError is an
     # Exception too, so inside it a wrong score would be turned into a skip.
     self.assertAlmostEqual(score, 39.764, places=3)

Beispiel #3

Datei anzeigen

    def forward(self, predicted, gold, log_probs):
        """
        Policy-gradient (REINFORCE) loss from sentence-level BLEU rewards.

        Each prediction is scored against its gold sentence with ``bleu``.
        Depending on ``self.reward`` the scores are then replaced by a
        constant 1 (``"constant"``), rescaled linearly to [-0.5, 0.5] using
        the minimum and maximum of this call's scores (``"scaled_bleu"``), or,
        for ``"bleu"`` with ``self.baseline == "average_reward_baseline"``,
        shifted by the mean of all scores accumulated in ``self.bleu``.
        Any other setting leaves the raw scores untouched.

        :param predicted: predicted sentences
        :param gold: gold sentences
        :param log_probs: log probabilities, one per predicted sentence
        :return: loss, rewards used in the loss, raw BLEU scores
        """
        raw_rewards = [bleu([hyp], [ref])
                       for hyp, ref in zip(predicted, gold)]
        rewards = raw_rewards

        if self.reward == "constant":
            rewards = [1 for _ in log_probs]
        elif self.reward == "scaled_bleu":
            # scale relative to the scores of this call only
            lower, upper = -0.5, 0.5
            highest = max(raw_rewards)
            lowest = min(raw_rewards)
            rewards = []
            for reward in raw_rewards:
                if highest - lowest == 0:
                    # all scores identical: no signal to scale
                    rewards.append(0)
                else:
                    rewards.append(
                        (((upper - lower) * (reward - lowest))
                         / (highest - lowest)) + lower)
        elif (self.reward == "bleu"
              and self.baseline == "average_reward_baseline"):
            # baseline is the mean over every score seen so far
            self.bleu.extend(raw_rewards)
            mean_reward = np.mean(self.bleu)
            rewards = [reward - mean_reward for reward in raw_rewards]

        # negative log-likelihood weighted by the reward of each sentence
        loss = sum(-log_prob * reward
                   for log_prob, reward in zip(log_probs, rewards))
        return loss, rewards, raw_rewards

Beispiel #4

Datei anzeigen

Datei: prediction.py Projekt: strategist922/joeynmt

def validate_on_data(model: Model, data: Dataset,
                     batch_size: int,
                     use_cuda: bool, max_output_length: int,
                     level: str, eval_metric: Optional[str],
                     n_gpu: int,
                     batch_class: Batch = Batch,
                     compute_loss: bool = False,
                     beam_size: int = 1, beam_alpha: int = -1,
                     batch_type: str = "sentence",
                     postprocess: bool = True,
                     bpe_type: str = "subword-nmt",
                     sacrebleu: dict = None,
                     n_best: int = 1) \
        -> (float, float, float, List[str], List[List[str]], List[str],
            List[str], List[List[str]], List[np.array]):
    """
    Translate `data` with `model` and score the translations.

    When `compute_loss` is True and a batch carries targets, the loss of that
    batch is accumulated as well and reported together with the perplexity.

    :param model: model module
    :param data: dataset for validation
    :param batch_size: validation batch size, must not be below `n_gpu`
    :param batch_class: class used to wrap each raw batch
    :param use_cuda: if True, use CUDA
    :param max_output_length: maximum length for generated hypotheses
    :param level: segmentation level, one of "char", "bpe", "word"
    :param eval_metric: evaluation metric, one of "bleu", "chrf",
        "token_accuracy", "sequence_accuracy" (case-insensitive)
    :param n_gpu: number of GPUs
    :param compute_loss: whether to compute a scalar loss
        for given inputs and targets
    :param beam_size: beam size for validation.
        If <2 then greedy decoding (default).
    :param beam_alpha: beam search alpha for length penalty,
        disabled if set to -1 (default).
    :param batch_type: validation batch type (sentence or token)
    :param postprocess: if True, remove BPE segmentation from translations
    :param bpe_type: bpe type, one of {"subword-nmt", "sentencepiece"}
    :param sacrebleu: sacrebleu options; defaults to
        ``{"remove_whitespace": True, "tokenize": "13a"}``
    :param n_best: amount of candidates to return

    :return:
        - current_valid_score: current validation score [eval_metric],
          -1 without references, 0 for an unrecognised metric,
        - valid_loss: validation loss (-1 if not computed),
        - valid_ppl: validation perplexity (-1 if not computed),
        - valid_sources: validation sources,
        - valid_sources_raw: raw validation sources (before post-processing),
        - valid_references: validation references,
        - valid_hypotheses: validation hypotheses,
        - decoded_valid: raw validation hypotheses (before post-processing),
        - valid_attention_scores: attention scores for validation hypotheses
    """
    assert batch_size >= n_gpu, "batch_size must be bigger than n_gpu."
    sacrebleu = ({"remove_whitespace": True, "tokenize": "13a"}
                 if sacrebleu is None else sacrebleu)
    if batch_size > 1000 and batch_type == "sentence":
        logger.warning(
            "WARNING: Are you sure you meant to work on huge batches like "
            "this? 'batch_size' is > 1000 for sentence-batching. "
            "Consider decreasing it or switching to"
            " 'eval_batch_type: token'.")
    batches = make_data_iter(dataset=data,
                             batch_size=batch_size,
                             batch_type=batch_type,
                             shuffle=False,
                             train=False)
    valid_sources_raw = data.src
    pad_index = model.src_vocab.stoi[PAD_TOKEN]
    # evaluation mode (disables dropout)
    model.eval()
    # no gradient tracking while validating
    with torch.no_grad():
        all_outputs = []
        valid_attention_scores = []
        loss_sum = 0
        ntokens_sum = 0
        nseqs_sum = 0
        for raw_batch in batches:
            batch = batch_class(raw_batch, pad_index, use_cuda=use_cuda)
            # sorting by source length returns the permutation needed to
            # restore the original order afterwards
            reverse_index = batch.sort_by_src_length()
            sort_reverse_index = expand_reverse_index(reverse_index, n_best)

            # teacher-forced pass, only needed for the validation loss
            if compute_loss and batch.trg is not None:
                batch_loss, _, _, _ = model(return_type="loss", **vars(batch))
                if n_gpu > 1:
                    # average on multi-gpu
                    batch_loss = batch_loss.mean()
                loss_sum += batch_loss
                ntokens_sum += batch.ntokens
                nseqs_sum += batch.nseqs

            # inference pass that produces the translations
            output, attention_scores = run_batch(
                model=model,
                batch=batch,
                beam_size=beam_size,
                beam_alpha=beam_alpha,
                max_output_length=max_output_length,
                n_best=n_best)

            # put outputs back into the original order
            all_outputs.extend(output[sort_reverse_index])
            if attention_scores is not None:
                valid_attention_scores.extend(
                    attention_scores[sort_reverse_index])

        assert len(all_outputs) == len(data) * n_best

        valid_loss = -1
        valid_ppl = -1
        if compute_loss and ntokens_sum > 0:
            # total validation loss
            valid_loss = loss_sum
            # exponent of token-level negative log prob
            valid_ppl = torch.exp(loss_sum / ntokens_sum)

        # indices -> symbols
        decoded_valid = model.trg_vocab.arrays_to_sentences(
            arrays=all_outputs, cut_at_eos=True)

        # build plain strings for the metric
        glue = (" " if level in ["word", "bpe"] else "").join
        valid_sources = [glue(tokens) for tokens in data.src]
        valid_references = [glue(tokens) for tokens in data.trg]
        valid_hypotheses = [glue(tokens) for tokens in decoded_valid]

        # remove the BPE segmentation if requested
        if level == "bpe" and postprocess:
            valid_sources = [
                bpe_postprocess(text, bpe_type=bpe_type)
                for text in valid_sources
            ]
            valid_references = [
                bpe_postprocess(text, bpe_type=bpe_type)
                for text in valid_references
            ]
            valid_hypotheses = [
                bpe_postprocess(text, bpe_type=bpe_type)
                for text in valid_hypotheses
            ]

        if not valid_references:
            # no references: nothing to evaluate against
            current_valid_score = -1
        else:
            assert len(valid_hypotheses) == len(valid_references)
            metric = eval_metric.lower()
            if metric == 'bleu':
                current_valid_score = bleu(valid_hypotheses,
                                           valid_references,
                                           tokenize=sacrebleu["tokenize"])
            elif metric == 'chrf':
                current_valid_score = chrf(
                    valid_hypotheses,
                    valid_references,
                    remove_whitespace=sacrebleu["remove_whitespace"])
            elif metric == 'token_accuracy':
                # token accuracy works on token lists, not joined strings
                current_valid_score = token_accuracy(
                    list(decoded_valid), list(data.trg))
            elif metric == 'sequence_accuracy':
                current_valid_score = sequence_accuracy(
                    valid_hypotheses, valid_references)
            else:
                current_valid_score = 0

    return (current_valid_score, valid_loss, valid_ppl, valid_sources,
            valid_sources_raw, valid_references, valid_hypotheses,
            decoded_valid, valid_attention_scores)

Beispiel #5

Datei anzeigen

Datei: prediction.py Projekt: flenniffer/joeynmt

def validate_on_data(model: Model, data: Dataset,
                     logger: Logger,
                     batch_size: int,
                     use_cuda: bool, max_output_length: int,
                     level: str, eval_metric: Optional[str],
                     loss_function: torch.nn.Module = None,
                     beam_size: int = 1, beam_alpha: int = -1,
                     batch_type: str = "sentence",
                     postprocess: bool = True
                     ) \
        -> (float, float, float, List[str], List[List[str]], List[str],
            List[str], List[List[str]], List[np.array]):
    """
    Translate `data` with `model` and score the translations.

    When `loss_function` is given and a batch carries targets, the loss of
    that batch is accumulated as well and reported with the perplexity.

    :param model: model module
    :param logger: logger, used for the large-batch warning
    :param data: dataset for validation
    :param batch_size: validation batch size
    :param use_cuda: if True, use CUDA
    :param max_output_length: maximum length for generated hypotheses
    :param level: segmentation level, one of "char", "bpe", "word"
    :param eval_metric: evaluation metric, one of "bleu", "chrf",
        "token_accuracy", "sequence_accuracy" (case-insensitive)
    :param loss_function: loss function that computes a scalar loss
        for given inputs and targets
    :param beam_size: beam size for validation.
        If <2 then greedy decoding (default).
    :param beam_alpha: beam search alpha for length penalty,
        disabled if set to -1 (default).
    :param batch_type: validation batch type (sentence or token)
    :param postprocess: if True, remove BPE segmentation from translations

    :return:
        - current_valid_score: current validation score [eval_metric],
          -1 without references, 0 for an unrecognised metric,
        - valid_loss: validation loss (-1 if not computed),
        - valid_ppl: validation perplexity (-1 if not computed),
        - valid_sources: validation sources,
        - valid_sources_raw: raw validation sources (before post-processing),
        - valid_references: validation references,
        - valid_hypotheses: validation hypotheses,
        - decoded_valid: raw validation hypotheses (before post-processing),
        - valid_attention_scores: attention scores for validation hypotheses
    """
    if batch_size > 1000 and batch_type == "sentence":
        logger.warning(
            "WARNING: Are you sure you meant to work on huge batches like "
            "this? 'batch_size' is > 1000 for sentence-batching. "
            "Consider decreasing it or switching to"
            " 'eval_batch_type: token'.")
    batches = make_data_iter(dataset=data,
                             batch_size=batch_size,
                             batch_type=batch_type,
                             shuffle=False,
                             train=False)
    valid_sources_raw = data.src
    pad_index = model.src_vocab.stoi[PAD_TOKEN]
    # evaluation mode (disables dropout)
    model.eval()
    # no gradient tracking while validating
    with torch.no_grad():
        all_outputs = []
        valid_attention_scores = []
        loss_sum = 0
        ntokens_sum = 0
        nseqs_sum = 0
        for raw_batch in batches:
            batch = Batch(raw_batch, pad_index, use_cuda=use_cuda)
            # sorting by source length returns the permutation needed to
            # restore the original order afterwards
            sort_reverse_index = batch.sort_by_src_lengths()

            # teacher-forced pass, only needed for the validation loss
            if loss_function is not None and batch.trg is not None:
                batch_loss = model.get_loss_for_batch(
                    batch, loss_function=loss_function)
                loss_sum += batch_loss
                ntokens_sum += batch.ntokens
                nseqs_sum += batch.nseqs

            # inference pass that produces the translations
            output, attention_scores = model.run_batch(
                batch=batch,
                beam_size=beam_size,
                beam_alpha=beam_alpha,
                max_output_length=max_output_length)

            # put outputs back into the original order
            all_outputs.extend(output[sort_reverse_index])
            if attention_scores is not None:
                valid_attention_scores.extend(
                    attention_scores[sort_reverse_index])

        assert len(all_outputs) == len(data)

        valid_loss = -1
        valid_ppl = -1
        if loss_function is not None and ntokens_sum > 0:
            # total validation loss
            valid_loss = loss_sum
            # exponent of token-level negative log prob
            valid_ppl = torch.exp(loss_sum / ntokens_sum)

        # indices -> symbols
        decoded_valid = model.trg_vocab.arrays_to_sentences(
            arrays=all_outputs, cut_at_eos=True)

        # build plain strings for the metric
        glue = (" " if level in ["word", "bpe"] else "").join
        valid_sources = [glue(tokens) for tokens in data.src]
        valid_references = [glue(tokens) for tokens in data.trg]
        valid_hypotheses = [glue(tokens) for tokens in decoded_valid]

        # remove the BPE segmentation if requested
        if level == "bpe" and postprocess:
            valid_sources = [bpe_postprocess(text) for text in valid_sources]
            valid_references = [
                bpe_postprocess(text) for text in valid_references
            ]
            valid_hypotheses = [
                bpe_postprocess(text) for text in valid_hypotheses
            ]

        if not valid_references:
            # no references: nothing to evaluate against
            current_valid_score = -1
        else:
            assert len(valid_hypotheses) == len(valid_references)
            metric = eval_metric.lower()
            if metric == 'bleu':
                # this version does not use any tokenization
                current_valid_score = bleu(valid_hypotheses, valid_references)
            elif metric == 'chrf':
                current_valid_score = chrf(valid_hypotheses, valid_references)
            elif metric == 'token_accuracy':
                current_valid_score = token_accuracy(valid_hypotheses,
                                                     valid_references,
                                                     level=level)
            elif metric == 'sequence_accuracy':
                current_valid_score = sequence_accuracy(
                    valid_hypotheses, valid_references)
            else:
                current_valid_score = 0

    return (current_valid_score, valid_loss, valid_ppl, valid_sources,
            valid_sources_raw, valid_references, valid_hypotheses,
            decoded_valid, valid_attention_scores)

Beispiel #6

Datei anzeigen

def validate_on_data(model,
                     data,
                     batch_size,
                     use_cuda,
                     max_output_length,
                     level,
                     eval_metric,
                     criterion,
                     beam_size=0,
                     beam_alpha=-1):
    """
    Translate `data` with `model` and score the translations.

    When `criterion` is not None and a batch carries targets, the loss of
    that batch is accumulated as well and reported with the perplexity.

    :param model: model used for loss computation and decoding
    :param data: dataset providing `src` and `trg`
    :param batch_size: validation batch size
    :param use_cuda: forwarded to `Batch`
    :param max_output_length: maximum length for generated hypotheses
    :param level: segmentation level; "word" and "bpe" tokens are joined
        with spaces, anything else without separator
    :param eval_metric: one of "bleu", "chrf", "token_accuracy",
        "sequence_accuracy" (case-insensitive)
    :param criterion: loss criterion, or None to skip the loss
    :param beam_size: forwarded to `model.run_batch`
    :param beam_alpha: forwarded to `model.run_batch`
    :return: tuple of validation score (-1 without references, 0 for an
        unrecognised metric), validation loss and perplexity (-1 if not
        computed), joined sources, raw sources, joined references, joined
        hypotheses, decoded hypotheses, attention scores
    """
    batches = make_data_iter(dataset=data,
                             batch_size=batch_size,
                             shuffle=False,
                             train=False)
    valid_sources_raw = list(data.src)
    pad_index = model.src_vocab.stoi[PAD_TOKEN]
    # evaluation mode (disables dropout)
    model.eval()
    # no gradient tracking while validating
    with torch.no_grad():
        all_outputs = []
        valid_attention_scores = []
        loss_sum = 0
        ntokens_sum = 0
        for raw_batch in batches:
            batch = Batch(raw_batch, pad_index, use_cuda=use_cuda)
            # sorting by source length returns the permutation needed to
            # restore the original order afterwards
            sort_reverse_index = batch.sort_by_src_lengths()

            # TODO save computation: forward pass is computed twice
            # teacher-forced pass, only needed for the validation loss
            if criterion is not None and batch.trg is not None:
                batch_loss = model.get_loss_for_batch(batch,
                                                      criterion=criterion)
                loss_sum += batch_loss
                ntokens_sum += batch.ntokens

            # inference pass that produces the translations
            output, attention_scores = model.run_batch(
                batch=batch,
                beam_size=beam_size,
                beam_alpha=beam_alpha,
                max_output_length=max_output_length)

            # put outputs back into the original order
            all_outputs.extend(output[sort_reverse_index])
            if attention_scores is not None:
                valid_attention_scores.extend(
                    attention_scores[sort_reverse_index])

        assert len(all_outputs) == len(data)

        valid_loss = -1
        valid_ppl = -1
        if criterion is not None and ntokens_sum > 0:
            # total validation loss
            valid_loss = loss_sum
            # exponent of token-level negative log prob
            valid_ppl = torch.exp(loss_sum / ntokens_sum)

        # indices -> symbols
        decoded_valid = arrays_to_sentences(arrays=all_outputs,
                                            vocabulary=model.trg_vocab,
                                            cut_at_eos=True)

        # build plain strings for the metric
        glue = (" " if level in ["word", "bpe"] else "").join
        valid_sources = [glue(tokens) for tokens in data.src]
        valid_references = [glue(tokens) for tokens in data.trg]
        valid_hypotheses = [glue(tokens) for tokens in decoded_valid]

        # remove the BPE segmentation
        if level == "bpe":
            valid_sources = [bpe_postprocess(text) for text in valid_sources]
            valid_references = [
                bpe_postprocess(text) for text in valid_references
            ]
            valid_hypotheses = [
                bpe_postprocess(text) for text in valid_hypotheses
            ]

        if not valid_references:
            # no references: nothing to evaluate against
            current_valid_score = -1
        else:
            assert len(valid_hypotheses) == len(valid_references)
            metric = eval_metric.lower()
            if metric == 'bleu':
                # this version does not use any tokenization
                current_valid_score = bleu(valid_hypotheses, valid_references)
            elif metric == 'chrf':
                current_valid_score = chrf(valid_hypotheses, valid_references)
            elif metric == 'token_accuracy':
                current_valid_score = token_accuracy(valid_hypotheses,
                                                     valid_references,
                                                     level=level)
            elif metric == 'sequence_accuracy':
                current_valid_score = sequence_accuracy(
                    valid_hypotheses, valid_references)
            else:
                current_valid_score = 0

    return (current_valid_score, valid_loss, valid_ppl, valid_sources,
            valid_sources_raw, valid_references, valid_hypotheses,
            decoded_valid, valid_attention_scores)

Beispiel #7

Datei anzeigen

    def dev_network(self):
        """
        Report the current performance of ``self.eval_net`` on the dev data.

        For every dataset in ``self.data_to_dev`` the sentences are decoded
        one at a time (batch size 1): the decoder of ``self.model`` is
        unrolled step by step, its state (concatenated hidden state or the
        previous attention vector, depending on ``self.state_type``) is fed
        to ``self.eval_net`` and the arg-max of its output is taken as the
        next token. The positive entries of ``self.Reward`` are summed per
        dataset, the decoded output is scored with ``self.eval_metric``, and
        both numbers are written to ``self.tb_writer`` and printed.

        ``self.eval_net`` is frozen for the whole evaluation and unfrozen
        again afterwards, even if an exception interrupts the evaluation.

        :return: metric score of the last evaluated dataset; -1 if that
            dataset has no references or if there is no dataset at all
        """
        # Sentinel used below for "no score"; also covers an empty
        # ``self.data_to_dev``, which would otherwise leave the name unbound.
        current_valid_score = -1

        freeze_model(self.eval_net)
        try:
            for data_set in self.data_to_dev.values():
                valid_iter = make_data_iter(dataset=data_set,
                                            batch_size=1,
                                            batch_type=self.batch_type,
                                            shuffle=False,
                                            train=False)

                r_total = 0
                roptimal_total = 0
                all_outputs = []
                i_sample = 0

                for valid_batch in iter(valid_iter):
                    batch = Batch(valid_batch,
                                  self.pad_index,
                                  use_cuda=self.use_cuda)

                    encoder_output, encoder_hidden = self.model.encode(
                        batch.src, batch.src_lengths, batch.src_mask)

                    # if maximum output length is
                    # not globally specified, adapt to src len
                    # NOTE(review): the value is stored on ``self``, so it is
                    # derived from the first batch only and reused for all
                    # later batches and calls -- confirm this is intended.
                    if self.max_output_length is None:
                        self.max_output_length = int(
                            max(batch.src_lengths.cpu().numpy()) * 1.5)

                    batch_size = batch.src_mask.size(0)
                    prev_y = batch.src_mask.new_full(
                        size=[batch_size, 1],
                        fill_value=self.bos_index,
                        dtype=torch.long)
                    output = []
                    hidden = self.model.decoder._init_hidden(encoder_hidden)
                    prev_att_vector = None
                    finished = batch.src_mask.new_zeros(
                        (batch_size, 1)).byte()

                    for _ in range(self.max_output_length):
                        # decode one single step; the decoder's own logits
                        # are discarded, eval_net chooses the next token
                        _, hidden, _, prev_att_vector = self.model.decoder(
                            encoder_output=encoder_output,
                            encoder_hidden=encoder_hidden,
                            src_mask=batch.src_mask,
                            trg_embed=self.model.trg_embed(prev_y),
                            hidden=hidden,
                            prev_att_vector=prev_att_vector,
                            unroll_steps=1)

                        if self.state_type == 'hidden':
                            state = torch.cat(
                                hidden, dim=2).squeeze(1).detach().cpu()[0]
                        else:
                            state = torch.FloatTensor(
                                prev_att_vector.squeeze(
                                    1).detach().cpu().numpy()[0])

                        # greedy choice over the eval_net output
                        logits = self.eval_net(state)
                        logits = logits.reshape([1, 1, -1])
                        next_word = torch.argmax(logits, dim=-1)

                        output.append(
                            next_word.squeeze(1).detach().cpu().numpy())
                        prev_y = next_word

                        # check if previous symbol was <eos>
                        is_eos = torch.eq(next_word, self.eos_index)
                        finished += is_eos
                        # stop predicting if <eos> reached for all elements
                        # in batch
                        if (finished >= 1).sum() == batch_size:
                            break
                    stacked_output = np.stack(output, axis=1)  # batch, time

                    r = self.Reward(batch.trg, stacked_output, show=False)

                    if i_sample == 0 or i_sample == 3 or i_sample == 6:
                        # decode back to symbols, only needed for display
                        decoded_valid_out_trg = \
                            self.model.trg_vocab.arrays_to_sentences(
                                arrays=batch.trg, cut_at_eos=True)
                        decoded_valid_out = \
                            self.model.trg_vocab.arrays_to_sentences(
                                arrays=stacked_output, cut_at_eos=True)
                        print(
                            "\n Sample ", i_sample,
                            "-------------Target vs Eval_net prediction:--Raw---and---Decoded-----"
                        )
                        print("Target: ", batch.trg, decoded_valid_out_trg)
                        print("Eval  : ", stacked_output, decoded_valid_out,
                              "\n")
                        print("Reward: ", r)

                    # only positive rewards count towards the total
                    r_total += sum(r[np.where(r > 0)])
                    if i_sample == 0:
                        # reward obtained when the target itself is the
                        # hypothesis (first sample only)
                        roptimal = self.Reward(batch.trg, batch.trg,
                                               show=False)
                        roptimal_total += sum(
                            roptimal[np.where(roptimal > 0)])

                    all_outputs.extend(stacked_output)
                    i_sample += 1

                assert len(all_outputs) == len(data_set)

                # decode back to symbols
                decoded_valid = self.model.trg_vocab.arrays_to_sentences(
                    arrays=all_outputs, cut_at_eos=True)

                # evaluate with metric on full dataset
                join_char = " " if self.level in ["word", "bpe"] else ""
                valid_references = [join_char.join(t) for t in data_set.trg]
                valid_hypotheses = [join_char.join(t) for t in decoded_valid]

                # post-process
                if self.level == "bpe":
                    valid_references = [
                        bpe_postprocess(v) for v in valid_references
                    ]
                    valid_hypotheses = [
                        bpe_postprocess(v) for v in valid_hypotheses
                    ]

                # if references are given, evaluate against them
                if valid_references:
                    assert len(valid_hypotheses) == len(valid_references)

                    current_valid_score = 0
                    metric = self.eval_metric.lower()
                    if metric == 'bleu':
                        # this version does not use any tokenization
                        current_valid_score = bleu(valid_hypotheses,
                                                   valid_references)
                    elif metric == 'chrf':
                        current_valid_score = chrf(valid_hypotheses,
                                                   valid_references)
                    elif metric == 'token_accuracy':
                        current_valid_score = token_accuracy(
                            valid_hypotheses,
                            valid_references,
                            level=self.level)
                    elif metric == 'sequence_accuracy':
                        current_valid_score = sequence_accuracy(
                            valid_hypotheses, valid_references)
                else:
                    current_valid_score = -1

                self.dev_network_count += 1
                self.tb_writer.add_scalar("dev/dev_reward", r_total,
                                          self.dev_network_count)
                self.tb_writer.add_scalar("dev/dev_bleu", current_valid_score,
                                          self.dev_network_count)

                print(self.dev_network_count, ' r_total and score: ', r_total,
                      current_valid_score)
        finally:
            # Pair the single freeze above with a single unfreeze, so every
            # dataset is evaluated frozen and the net is always released.
            unfreeze_model(self.eval_net)
        return current_valid_score

Beispiel #8

Datei anzeigen

    def Reward_lin(self, trg, hyp, show=False):
        """
        Reward function (usable as ``self.Reward``) that weights the
        sentence-level metric score by position.

        Row 0 of ``hyp`` is compared element-wise with ``trg``; positions
        that match receive a cumulative linear weight multiplied by the
        current evaluation score, all other positions keep the value -1.

        :param trg: target token ids; read through ``.shape[1]`` and
            ``.numpy()`` (assumed to be a 2-D tensor -- TODO confirm).
        :param hyp: predicted token ids; only row 0 is compared to ``trg``.
        :param show: Boolean, print the decoded sentences and the rewards.
        :return: numpy array with ``hyp.shape[1] - 1`` rewards.
        """
        n_trg = trg.shape[1]
        n_hyp = hyp.shape[1]

        # Every step starts at -1; the compared prefix is overwritten below.
        rewards = np.full(n_hyp - 1, -1.0)

        # Only the part both sequences have in common can be compared.
        overlap = min(n_trg, n_hyp)
        padded_hyp = np.zeros([1, n_trg])
        padded_hyp[0, :overlap] = hyp[0, :overlap]

        matches = trg.numpy() == padded_hyp

        vocab = self.model.trg_vocab
        trg_tokens = vocab.arrays_to_sentences(arrays=trg, cut_at_eos=True)
        hyp_tokens = vocab.arrays_to_sentences(arrays=hyp, cut_at_eos=True)

        if show:
            print('la lista trg-out decodificada: ', trg_tokens)
            print('la lista hypotesis decodificada: ', hyp_tokens)

        # Turn token lists back into strings for the metric functions.
        sep = " " if self.level in ["word", "bpe"] else ""
        references = [sep.join(tokens) for tokens in trg_tokens]
        hypotheses = [sep.join(tokens) for tokens in hyp_tokens]

        if self.level == "bpe":
            references = [bpe_postprocess(ref) for ref in references]
            hypotheses = [bpe_postprocess(out) for out in hypotheses]

        if not references:
            # No references available: mark the score as undefined.
            score = -1
        else:
            assert len(hypotheses) == len(references)

            # An unrecognised metric name leaves the score at 0.
            score = 0
            metric = self.eval_metric.lower()
            if metric == 'bleu':
                # this version does not use any tokenization
                score = bleu(hypotheses, references)
            elif metric == 'chrf':
                score = chrf(hypotheses, references)
            elif metric == 'token_accuracy':
                score = token_accuracy(hypotheses,
                                       references,
                                       level=self.level)
            elif metric == 'sequence_accuracy':
                score = sequence_accuracy(hypotheses, references)

        # Linear weights 1/k, 2/k, ... with k = 0 + 1 + ... + (n_trg - 1),
        # accumulated so that entry i holds the sum of the first i weights.
        weight_norm = sum(np.arange(n_trg))
        step_weights = np.arange(1, n_trg) / weight_norm
        cumulative = [
            sum(step_weights[:i]) for i in np.arange(1, n_trg, dtype='int')
        ]
        cumulative = np.asanyarray(cumulative).reshape([1, n_trg - 1])
        # NOTE(review): `cumulative` is (1, n_trg - 1) while `matches` takes
        # its last dimension from `padded_hyp` (n_trg); confirm that these
        # shapes are meant to broadcast for n_trg > 1.
        masked = np.multiply(cumulative, matches).reshape([n_trg - 1])

        rewards[:overlap - 1] = np.multiply(masked, score)[:overlap]

        if show:
            print('Reward is: ', rewards)
            print('sum: ', sum(rewards))
        return rewards

Beispiel #9

Datei anzeigen

Datei: test_metric.py Projekt: Freshia/joeynmtmaml

 def test_bleu_13a(self):
     """BLEU with the "13a" tokenizer on a one-sentence corpus."""
     hypotheses = ["this is a test."]
     references = ["this is a tezt."]
     self.assertAlmostEqual(bleu(hypotheses, references, tokenize="13a"),
                            42.729,
                            places=3)

Beispiel #10

Datei anzeigen

Datei: prediction.py Projekt: marvosyntactical/joeynmt

def _debug_example_attrs(dataset, extra_keys=(), limit=3):
    """
    Collect selected attribute values of the first `limit` examples.

    An attribute is kept when its name contains "src", "trg" or one of
    `extra_keys`, or when its name contains "kb" and its value is iterable.

    :param dataset: sliceable collection of examples, or None
    :param extra_keys: additional substrings to look for in attribute names
    :param limit: number of leading examples to inspect
    :return: one list of attribute values per example, or None if
        `dataset` is None
    """
    if dataset is None:
        return None
    name_keys = ("src", "trg") + tuple(extra_keys)
    return [[
        getattr(example, attr) for attr in dir(example)
        if (hasattr(getattr(example, attr), "__iter__") and "kb" in attr)
        or any(key in attr for key in name_keys)
    ] for example in dataset[:limit]]


def validate_on_data(model: Model,
                     data: Dataset,
                     batch_size: int,
                     use_cuda: bool,
                     max_output_length: int,
                     level: str,
                     eval_metric: Optional[str],
                     loss_function: torch.nn.Module = None,
                     beam_size: int = 0,
                     beam_alpha: int = -1,
                     batch_type: str = "sentence",
                     kb_task = None,
                     valid_kb: Dataset = None,
                     valid_kb_lkp: list = [],
                     valid_kb_lens:list=[],
                     valid_kb_truvals: Dataset = None,
                     valid_data_canon: Dataset = None,
                     report_on_canonicals: bool = False,
                     ) \
        -> (float, float, float, List[str], List[List[str]], List[str],
            List[str], List[List[str]], List[np.array], list, float, float):
    """
    Generate translations for the given data.
    If `loss_function` is not None and references are given,
    also compute the loss.

    :param model: model module
    :param data: dataset for validation
    :param batch_size: validation batch size
    :param use_cuda: if True, use CUDA
    :param max_output_length: maximum length for generated hypotheses
    :param level: segmentation level, one of "char", "bpe", "word"
    :param eval_metric: evaluation metric, e.g. "bleu"; if None or not
        recognised, the reported score is 0
    :param loss_function: loss function that computes a scalar loss
        for given inputs and targets
    :param beam_size: beam size for validation.
        If 0 then greedy decoding (default).
    :param beam_alpha: beam search alpha for length penalty,
        disabled if set to -1 (default).
    :param batch_type: validation batch type (sentence or token)
    :param kb_task: is not None if kb_task should be executed
    :param valid_kb: MonoDataset holding the loaded valid kb data
    :param valid_kb_lkp: List with valid example index to corresponding kb indices
    :param valid_kb_lens: List with amount of triples per kb
    :param valid_kb_truvals: dataset forwarded to `make_data_iter_kb`
        together with the kb data
    :param valid_data_canon: TranslationDataset of valid data but with canonized target data (for loss reporting)
    :param report_on_canonicals: forwarded to `calc_ent_f1_and_ent_mcc`

    :return:
        - current_valid_score: current validation score [eval_metric],
          -1 if there are no references
        - valid_loss: validation loss, -1 if not computed
        - valid_ppl: validation perplexity, -1 if not computed
        - valid_sources: validation sources,
        - valid_sources_raw: raw validation sources (before post-processing),
        - valid_references: validation references,
        - valid_hypotheses: validation_hypotheses,
        - decoded_valid: raw validation hypotheses (before post-processing),
        - valid_attention_scores: attention scores for validation hypotheses
        - valid_kb_att_scores: kb attention scores returned by
          `model.run_batch`, empty if none were produced
        - valid_ent_f1: first value returned by `calc_ent_f1_and_ent_mcc`
          when `kb_task` is set and references exist, otherwise -1
        - valid_ent_mcc: second value returned by
          `calc_ent_f1_and_ent_mcc`, otherwise -1
    """

    print(f"\n{'-'*10} ENTER VALIDATION {'-'*10}\n")

    print(f"\n{'-'*10}  VALIDATION DEBUG {'-'*10}\n")

    # The kb-related arguments default to None, so every access below is
    # guarded; otherwise a plain (non-kb) validation run would crash here.
    print("---data---")
    print(dir(data[0]))
    print(_debug_example_attrs(data))
    print(batch_size)
    print(use_cuda)
    print(max_output_length)
    print(level)
    print(eval_metric)
    print(loss_function)
    print(beam_size)
    print(beam_alpha)
    print(batch_type)
    print(kb_task)
    print("---valid_kb---")
    if valid_kb is not None:
        print(dir(valid_kb[0]))
    print(_debug_example_attrs(valid_kb))
    print(len(valid_kb_lkp), valid_kb_lkp[-5:])
    print(len(valid_kb_lens), valid_kb_lens[-5:])
    print("---valid_kb_truvals---")
    if valid_kb_truvals is not None:
        # NOTE(review): this prints the tail of valid_kb_lens under the
        # truvals header; kept as in the original -- confirm it is intended.
        print(len(valid_kb_truvals), valid_kb_lens[-5:])
    print(_debug_example_attrs(valid_kb_truvals, ("trv", )))
    print("---valid_data_canon---")
    if valid_data_canon is not None:
        print(len(valid_data_canon), valid_data_canon[-5:])
    # The original filter read `"trv" or "can" in attr`, which is always
    # true; each substring is now tested against the attribute name.
    print(_debug_example_attrs(valid_data_canon, ("trv", "can")))
    print(report_on_canonicals)

    print(f"\n{'-'*10} END VALIDATION DEBUG {'-'*10}\n")

    if not kb_task:
        valid_iter = make_data_iter(dataset=data,
                                    batch_size=batch_size,
                                    batch_type=batch_type,
                                    shuffle=False,
                                    train=False)
    else:
        # knowledgebase version of make data iter and also provide canonized target data
        # data: for bleu/ent f1
        # canon_data: for loss
        valid_iter = make_data_iter_kb(data,
                                       valid_kb,
                                       valid_kb_lkp,
                                       valid_kb_lens,
                                       valid_kb_truvals,
                                       batch_size=batch_size,
                                       batch_type=batch_type,
                                       shuffle=False,
                                       train=False,
                                       canonize=model.canonize,
                                       canon_data=valid_data_canon)

    valid_sources_raw = data.src
    pad_index = model.src_vocab.stoi[PAD_TOKEN]

    # Defaults for the entity metrics so that the return statement is valid
    # even when there are no references to evaluate against.
    valid_ent_f1, valid_ent_mcc = -1, -1

    # disable dropout
    model.eval()
    # don't track gradients during validation
    with torch.no_grad():
        all_outputs = []
        valid_attention_scores = []
        valid_kb_att_scores = []
        total_loss = 0
        total_ntokens = 0
        total_nseqs = 0
        for valid_batch in iter(valid_iter):
            # run as during training to get validation loss (e.g. xent)

            batch = Batch(valid_batch, pad_index, use_cuda=use_cuda) \
                                if not kb_task else \
                Batch_with_KB(valid_batch, pad_index, use_cuda=use_cuda)

            assert hasattr(batch, "kbsrc") == bool(kb_task)

            # sort batch now by src length and keep track of order
            if not kb_task:
                sort_reverse_index = batch.sort_by_src_lengths()
            else:
                # kb batches are not sorted, so the identity order is used
                sort_reverse_index = list(range(batch.src.shape[0]))

            # run as during training with teacher forcing
            if loss_function is not None and batch.trg is not None:

                ntokens = batch.ntokens
                if hasattr(batch, "trgcanon") and batch.trgcanon is not None:
                    ntokens = batch.ntokenscanon  # normalize loss with num canonical tokens for perplexity
                # do a loss calculation without grad updates just to report valid loss
                # we can only do this when batch.trg exists, so not during actual translation/deployment
                batch_loss = model.get_loss_for_batch(
                    batch, loss_function=loss_function)
                # keep track of metrics for reporting
                total_loss += batch_loss
                total_ntokens += ntokens  # gold target tokens
                total_nseqs += batch.nseqs

            # run as during inference to produce translations
            output, attention_scores, kb_att_scores = model.run_batch(
                batch=batch,
                beam_size=beam_size,
                beam_alpha=beam_alpha,
                max_output_length=max_output_length)

            # sort outputs back to original order
            all_outputs.extend(output[sort_reverse_index])
            valid_attention_scores.extend(
                attention_scores[sort_reverse_index]
                if attention_scores is not None else [])
            valid_kb_att_scores.extend(kb_att_scores[sort_reverse_index]
                                       if kb_att_scores is not None else [])

        assert len(all_outputs) == len(data)

        if loss_function is not None and total_ntokens > 0:
            # total validation loss
            valid_loss = total_loss
            # exponent of token-level negative log likelihood
            # can be seen as 2^(cross_entropy of model on valid set); normalized by num tokens;
            # see https://en.wikipedia.org/wiki/Perplexity#Perplexity_per_word
            valid_ppl = torch.exp(valid_loss / total_ntokens)
        else:
            valid_loss = -1
            valid_ppl = -1

        # decode back to symbols
        decoding_vocab = model.trg_vocab if not kb_task else model.trv_vocab

        decoded_valid = decoding_vocab.arrays_to_sentences(arrays=all_outputs,
                                                           cut_at_eos=True)

        print(f"decoding_vocab.itos: {decoding_vocab.itos}")
        print(decoded_valid)

        # evaluate with metric on full dataset
        join_char = " " if level in ["word", "bpe"] else ""
        valid_sources = [join_char.join(s) for s in data.src]
        # TODO replace valid_references with uncanonicalized dev.car data ... requires writing new Dataset in data.py
        valid_references = [join_char.join(t) for t in data.trg]
        valid_hypotheses = [join_char.join(t) for t in decoded_valid]

        # post-process
        if level == "bpe":
            valid_sources = [bpe_postprocess(s) for s in valid_sources]
            valid_references = [bpe_postprocess(v) for v in valid_references]
            valid_hypotheses = [bpe_postprocess(v) for v in valid_hypotheses]

        # if references are given, evaluate against them
        if valid_references:
            assert len(valid_hypotheses) == len(valid_references)

            print(list(zip(valid_sources, valid_references, valid_hypotheses)))

            # eval_metric is Optional; a missing or unknown metric scores 0
            metric = eval_metric.lower() if eval_metric is not None else ""
            current_valid_score = 0
            if metric == 'bleu':
                # this version does not use any tokenization
                current_valid_score = bleu(valid_hypotheses, valid_references)
            elif metric == 'chrf':
                current_valid_score = chrf(valid_hypotheses, valid_references)
            elif metric == 'token_accuracy':
                current_valid_score = token_accuracy(valid_hypotheses,
                                                     valid_references,
                                                     level=level)
            elif metric == 'sequence_accuracy':
                current_valid_score = sequence_accuracy(
                    valid_hypotheses, valid_references)

            if kb_task:
                valid_ent_f1, valid_ent_mcc = calc_ent_f1_and_ent_mcc(
                    valid_hypotheses,
                    valid_references,
                    vocab=model.trv_vocab,
                    c_fun=model.canonize,
                    report_on_canonicals=report_on_canonicals)
        else:
            current_valid_score = -1

    print(f"\n{'-'*10} EXIT VALIDATION {'-'*10}\n")
    return current_valid_score, valid_loss, valid_ppl, valid_sources, \
        valid_sources_raw, valid_references, valid_hypotheses, \
        decoded_valid, valid_attention_scores, valid_kb_att_scores, \
        valid_ent_f1, valid_ent_mcc

Beispiel #11

Datei anzeigen

    def ned_a2c(self,
                max_output_length,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                src_length: Tensor,
                temperature: float,
                critic: nn.Module,
                topk: int,
                log_probabilities=False,
                pickle_logs=False):
        """ Computes forward pass for NED-A2C

        Encodes source, step by step decodes and samples actor output.
        For each step decodes critic output given actor outputs as target
        Computes actor loss and critic loss

        :param max_output_length: max output length; if None it is set to
            1.5 times the longest source length in the batch
        :param src: source input
        :param trg: target input
        :param src_mask: source mask
        :param src_length: length of source inputs
        :param temperature: softmax temperature
        :param critic: critic network
        :param topk: consider top-k parameters for logging
        :param log_probabilities: log probabilities
        :param pickle_logs: if True, decoding is not stopped early once
            every sequence has produced <eos>
        :return: tuple ``([actor loss, critic loss], logs)`` where ``logs``
            is the result of `log_peakiness` if `log_probabilities` is set,
            otherwise an empty list
        """

        if max_output_length is None:
            max_output_length = int(max(src_length.cpu().numpy()) * 1.5)
        batch_size = src_mask.size(0)
        trg_mask = src_mask.new_ones([1, 1, 1])
        # init actor parameters
        encoder_output, encoder_hidden = self._encode(src, src_length,
                                                      src_mask)
        # decoders without `_init_hidden` get a (0, 0) placeholder instead
        hidden = (self.decoder._init_hidden(encoder_hidden)) \
            if hasattr(self.decoder,'_init_hidden') else (0,0)
        attention_vectors = None
        # every sequence starts with the <bos> index
        ys = encoder_output.new_full([batch_size, 1],
                                     self.bos_index,
                                     dtype=torch.long)
        log_probs = 0
        distributions = []
        actor_log_probabs = []
        # init critic parameters
        critic_encoder_output, critic_encoder_hidden = critic._encode(
            src, src_length, src_mask)
        # NOTE(review): the critic's hidden state is initialised through
        # self.decoder, not critic.decoder -- confirm this is intended.
        critic_hidden = (self.decoder._init_hidden(critic_encoder_hidden)) \
            if hasattr(self.decoder,'_init_hidden') else (0,0)
        critic_logits = []
        critic_sequence = critic_encoder_output.new_full(
            size=[batch_size, 1], fill_value=self.bos_index, dtype=torch.long)
        critic_attention_vectors = None
        # init dict to track eos
        # maps batch position -> first step at which <eos> was sampled (-1: never)
        eos_dict = {i: -1 for i in range(batch_size)}
        finished = src_mask.new_zeros((batch_size)).byte()
        # decode with actor
        for i in range(max_output_length):
            # with `_init_hidden` only the last token is fed to the decoder,
            # otherwise the whole prefix generated so far
            previous_words = ys[:, -1].view(-1, 1) if hasattr(
                self.decoder, '_init_hidden') else ys
            logits, hidden, _, attention_vectors = self.decoder(
                trg_embed=self.trg_embed(previous_words),
                encoder_output=encoder_output,
                encoder_hidden=encoder_hidden,
                src_mask=src_mask,
                unroll_steps=1,
                hidden=hidden,
                prev_att_vector=attention_vectors,
                trg_mask=trg_mask)
            # keep the logits of the last position, scaled by the temperature
            logits = logits[:, -1] / temperature
            distrib = Categorical(logits=logits)
            distributions.append(distrib)
            sampled_word = distrib.sample()
            # accumulate the negative log-probability of the sampled tokens
            # NOTE(review): once `log_probs` is a tensor, `-=` presumably
            # updates it in place, so the entries appended below may all
            # refer to the same tensor -- confirm.
            log_probs -= distrib.log_prob(sampled_word)
            ys = torch.cat([ys, sampled_word.unsqueeze(-1)], dim=1)
            actor_log_probabs.append(log_probs)
            sampled_word_list = sampled_word.tolist()
            # remember the first step at which each sequence sampled <eos>
            for index in range(len(sampled_word_list)):
                if sampled_word_list[index] == self.eos_index:
                    if eos_dict[index] == -1:
                        eos_dict[index] = i
            # decode with critic, using actor as target
            critic_logit, critic_hidden, critic_attention_scores, critic_attention_vectors = critic.decoder(
                trg_embed=self.trg_embed(sampled_word.view(-1, 1)),
                encoder_output=critic_encoder_output,
                encoder_hidden=critic_encoder_hidden,
                src_mask=src_mask,
                unroll_steps=1,
                hidden=critic_hidden,
                prev_att_vector=critic_attention_vectors,
                trg_mask=trg_mask)
            critic_logits.append(critic_logit)
            critic_distrib = Categorical(
                logits=critic_logit.view(-1, critic_logit.size(-1)))
            critic_sample = critic_distrib.sample()
            critic_sequence = torch.cat(
                [critic_sequence, critic_sample.view(-1, 1)], -1)
            # prevent early stopping in decoding when logging gold token
            if not pickle_logs:
                # check if previous symbol was <eos>
                is_eos = torch.eq(sampled_word, self.eos_index)
                finished += is_eos
                # stop predicting if <eos> reached for all elements in batch
                if (finished >= 1).sum() == batch_size:
                    break
        # drop the leading <bos> column from both sequences
        ys = ys[:, 1:]
        critic_sequence = critic_sequence[:, 1:]
        predicted_output = self.trg_vocab.arrays_to_sentences(arrays=ys,
                                                              cut_at_eos=True)
        gold_output = self.trg_vocab.arrays_to_sentences(arrays=trg,
                                                         cut_at_eos=True)
        predicted_strings = [
            join_strings(wordlist) for wordlist in predicted_output
        ]
        gold_strings = [join_strings(wordlist) for wordlist in gold_output]
        # calculate rewards
        # one sentence-level BLEU score per (prediction, reference) pair
        bleu_scores = []
        for prediction, gold_ref in zip(predicted_strings, gold_strings):
            bleu_scores.append(bleu([prediction], [gold_ref]))
        bleu_tensor = torch.FloatTensor(bleu_scores).unsqueeze(1)
        # NOTE(review): the device is chosen from global CUDA availability,
        # not from the device of the model tensors -- confirm they agree.
        if torch.cuda.is_available():
            bleu_tensor = bleu_tensor.cuda()
        # stack the per-step critic outputs along a new leading dimension
        critic_logits_tensor = torch.stack(critic_logits)
        critic_logits_tensor = critic_logits_tensor.squeeze()
        if len(critic_logits_tensor.shape) == 1:
            critic_logits_tensor = critic_logits_tensor.unsqueeze(1)
        # zero the critic outputs from each sequence's <eos> step onwards
        # NOTE(review): for sequences that never sampled <eos> the entry is
        # still -1, so this slice zeroes only the last step -- confirm.
        for dict_index in eos_dict:
            critic_logits_tensor[eos_dict[dict_index]:, dict_index] = 0
        critic_logits = torch.unbind(critic_logits_tensor)
        # per-step reward: BLEU minus the critic's estimate
        rewards = [(bleu_tensor - logit).squeeze(1) for logit in critic_logits]
        # calculate critic loss
        # sum of squared differences between BLEU and the critic outputs
        critic_loss = torch.cat([
            torch.pow(bleu_tensor - logit, 2) for logit in critic_logits
        ]).sum()
        # calculate actor loss
        # accumulated negative log-probability weighted by (BLEU - critic)
        batch_loss = 0
        for log_prob, critic_logit in zip(actor_log_probabs, critic_logits):
            batch_loss += log_prob.unsqueeze(1) * (bleu_tensor - critic_logit)
        batch_loss = batch_loss.sum()
        # the log structure is only built when probability logging is requested
        return ([batch_loss, critic_loss], log_peakiness(self.pad_index, self.trg_vocab, topk, distributions, trg, batch_size, max_output_length, gold_strings, predicted_strings, rewards, bleu_scores)) \
        if log_probabilities else ([batch_loss, critic_loss], [])

Beispiel #12

Datei anzeigen

    def mrt(self,
            max_output_length,
            src: Tensor,
            trg: Tensor,
            src_mask: Tensor,
            src_length: Tensor,
            temperature: float,
            samples: int,
            alpha: float,
            topk: int,
            add_gold=False,
            log_probabilities=False,
            pickle_logs=False):
        """ Computes forward pass for MRT

        Encodes source, samples multiple output sequences.
        Computes rewards and MRT-loss

        :param max_output_length: max output length; if None it is set to
            1.5 times the longest source length in the batch
        :param src: source input
        :param trg: target input
        :param src_mask: source mask
        :param src_length: length of source inputs
        :param temperature: softmax temperature
        :param samples: number of sampled sentences for MRT
        :param alpha: smoothness of MRT
        :param topk: consider top-k parameters for logging
        :param add_gold: add gold translation as one additional sample
        :param log_probabilities: log probabilities
        :param pickle_logs: if True, decoding is not stopped early once
            every sequence has produced <eos>
        :return: tuple ``(loss, logs)`` where ``logs`` is the result of
            `log_peakiness` if `log_probabilities` is set, otherwise an
            empty list
        """
        # the gold translation occupies one extra sample slot
        if add_gold:
            samples = samples + 1
        encoder_output, encoder_hidden = self._encode(src, src_length,
                                                      src_mask)
        # if maximum output length is not globally specified, adapt to src len
        if max_output_length is None:
            max_output_length = int(max(src_length.cpu().numpy()) * 1.5)
        batch_size = src_mask.size(0)
        # every sequence starts with the <bos> index
        ys = encoder_output.new_full([batch_size, 1],
                                     self.bos_index,
                                     dtype=torch.long)
        trg_mask = src_mask.new_ones([1, 1, 1])
        total_prob = 0
        distributions = []
        attention_vectors = None
        # tile the encoder states so all samples are decoded in one batch
        encoder_output = encoder_output.repeat(samples, 1, 1)
        if hasattr(self.decoder, '_init_hidden'):
            hidden = self.decoder._init_hidden(encoder_hidden)
            # a hidden state of length 2 is treated as a pair of tensors
            # (presumably an LSTM (h, c) tuple -- TODO confirm)
            if len(hidden) == 2:
                hidden = (hidden[0].repeat(1, samples,
                                           1), hidden[1].repeat(1, samples, 1))
            else:
                hidden = hidden.repeat(1, samples, 1)
        else:
            # decoders without `_init_hidden` get a (0, 0) placeholder
            hidden = (0, 0)
        # repeat tensor for vectorized solution
        ys = ys.repeat(samples, 1)
        src_mask = src_mask.repeat(samples, 1, 1)
        finished = src_mask.new_zeros((batch_size * samples)).byte()
        # decode tokens
        for i in range(max_output_length):
            # with `_init_hidden` only the last token is fed to the decoder,
            # otherwise the whole prefix generated so far
            previous_words = ys[:, -1].view(-1, 1) if hasattr(
                self.decoder, '_init_hidden') else ys
            logits, hidden, _, attention_vectors = self.decoder(
                trg_embed=self.trg_embed(previous_words),
                encoder_output=encoder_output,
                encoder_hidden=encoder_hidden,
                src_mask=src_mask,
                unroll_steps=1,
                hidden=hidden,
                prev_att_vector=attention_vectors,
                trg_mask=trg_mask)
            # keep the logits of the last position, scaled by the temperature
            logits = logits[:, -1] / temperature
            distrib = Categorical(logits=logits)
            distributions.append(distrib)
            next_word = distrib.sample()
            if add_gold:
                # overwrite the last `batch_size` rows with the gold token
                # of this step, or with padding once the target is exhausted
                if i < trg.shape[1]:
                    ith_column = trg[:, i]
                else:
                    tensor = torch.ones((batch_size, ), dtype=torch.int64)
                    data = [self.pad_index] * batch_size
                    ith_column = tensor.new_tensor(data)
                next_word[-batch_size:] = ith_column
            ys = torch.cat([ys, next_word.unsqueeze(-1)], dim=1)
            # accumulate the log-probability of each sequence
            total_prob += distrib.log_prob(next_word)
            # prevent early stopping in decoding when logging gold token
            if not pickle_logs:
                # check if previous symbol was <eos>
                is_eos = torch.eq(next_word, self.eos_index)
                finished += is_eos
                # stop predicting if <eos> reached for all elements in batch
                if (finished >= 1).sum() == batch_size * samples:
                    break
        # drop the leading <bos> column
        ys = ys[:, 1:]
        # regroup the flat batch into one chunk of `batch_size` rows per sample
        all_sequences = torch.stack(torch.split(ys, batch_size))
        sentence_probabs = list(torch.split(total_prob, batch_size))
        predicted_outputs = [
            self.trg_vocab.arrays_to_sentences(arrays=sequ, cut_at_eos=True)
            for sequ in all_sequences
        ]
        gold_output = self.trg_vocab.arrays_to_sentences(arrays=trg,
                                                         cut_at_eos=True)
        predicted_sentences = [[
            join_strings(wordlist) for wordlist in predicted_output
        ] for predicted_output in predicted_outputs]
        gold_strings = [join_strings(wordlist) for wordlist in gold_output]
        # the same references are used for every sample
        all_gold_sentences = [gold_strings] * samples
        # Simon's trick
        # softmax over the sample dimension of the alpha-scaled log-probs
        list_of_Qs = torch.softmax(torch.stack(sentence_probabs) * alpha, 0)
        # calculate loss
        # negative BLEU of every sampled sentence, weighted by its Q value
        batch_loss = 0
        for index, Q in enumerate(list_of_Qs):
            for prediction, gold_ref, Q_iter in zip(predicted_sentences[index],
                                                    all_gold_sentences[index],
                                                    Q):
                batch_loss -= bleu([prediction], [gold_ref]) * Q_iter
        # rewards are reported for the last sample group only
        rewards = [
            bleu([prediction], [gold_ref]) for prediction, gold_ref in zip(
                predicted_sentences[-1], all_gold_sentences[-1])
        ]
        # currently unused
        Qs_to_return = [q.tolist() for q in list_of_Qs]
        # the log structure is only built when probability logging is requested
        return (batch_loss, log_peakiness(self.pad_index, self.trg_vocab, topk, distributions, \
            trg, batch_size, max_output_length, gold_strings, predicted_sentences, \
                Qs_to_return, rewards, mrt=True, samples=samples)) \
                if log_probabilities else (batch_loss, [])