Example #1
def beam_decode(decoder_context,
                decoder_hidden,
                encoder_outputs,
                max_len,
                beam_size=5):
    """Batched beam-search decoding.

    Args:
        decoder_context: [num_layers, batch_size, hidden_size] decoder context
            state -- TODO confirm shape against the encoder.
        decoder_hidden: [num_layers, batch_size, hidden_size] decoder hidden
            state; batch size is derived from dim 1 of this tensor.
        encoder_outputs: encoder states; repeated along dim 1 so each beam
            attends over its own copy.
        max_len: maximum number of decoding steps.
        beam_size: number of hypotheses kept per batch element.

    Returns:
        (prediction, final_score, length) as produced by ``beam.backtrack()``.
    """
    # BUGFIX: batch size must come from the input tensors, not from
    # args.beam_size (the original set batch_size == beam_size regardless
    # of the actual batch).
    batch_size = decoder_hidden.size(1)
    vocab_size = output_lang.n_words

    # [batch_size * beam_size] -- every beam starts with the SOS token.
    decoder_input = torch.ones(batch_size * beam_size, dtype=torch.long, device=device) * Language.sos_token

    # [num_layers, batch_size * beam_size, hidden_size]
    decoder_hidden = decoder_hidden.repeat(1, beam_size, 1)
    decoder_context = decoder_context.repeat(1, beam_size, 1)

    encoder_outputs = encoder_outputs.repeat(1, beam_size, 1)

    # [batch_size]: offsets [0, beam_size, ..., beam_size * (batch_size - 1)]
    # used to convert per-batch beam indices into flat beam indices.
    batch_position = torch.arange(0, batch_size, dtype=torch.long, device=device) * beam_size

    # Only the first beam of each batch element starts at score 0; the rest
    # start at -inf so the first expansion picks beam_size distinct tokens
    # instead of beam_size copies of the same one.
    score = torch.ones(batch_size * beam_size, device=device) * -float('inf')
    score.index_fill_(0, torch.arange(0, batch_size, dtype=torch.long, device=device) * beam_size, 0.0)

    # Beam stores per-step decisions for backtracking.
    beam = Beam(
        batch_size,
        beam_size,
        max_len,
        batch_position,
        Language.eos_token
    )

    for i in range(max_len):
        decoder_output, decoder_context, decoder_hidden, _ = decoder(decoder_input,
                                                                    decoder_context,
                                                                    decoder_hidden,
                                                                    encoder_outputs)
        # decoder_output is treated as log-probabilities of shape
        # [batch_size * beam_size, vocab_size] -- TODO confirm the decoder
        # applies log_softmax.
        log_prob = decoder_output

        # Accumulated scores: [batch_size * beam_size, vocab_size]
        score = score.view(-1, 1) + log_prob

        # Best beam_size candidates over all beams of each batch element.
        # score, top_k_idx: [batch_size, beam_size]
        score, top_k_idx = score.view(batch_size, -1).topk(beam_size, dim=1)

        # Next input token per surviving beam: [batch_size * beam_size]
        decoder_input = (top_k_idx % vocab_size).view(-1)

        # BUGFIX: floor division -- `/` produces a float tensor in modern
        # PyTorch, which breaks the integer index arithmetic below.
        beam_idx = top_k_idx // vocab_size  # [batch_size, beam_size]

        # Flat indices of the surviving beams: [batch_size * beam_size]
        top_k_pointer = (beam_idx + batch_position.unsqueeze(1)).view(-1)

        # Reorder recurrent state to follow the surviving beams.
        # [num_layers, batch_size * beam_size, hidden_size]
        decoder_hidden = decoder_hidden.index_select(1, top_k_pointer)
        decoder_context = decoder_context.index_select(1, top_k_pointer)

        # Record this step's scores and decisions for backtracking.
        beam.update(score.clone(), top_k_pointer, decoder_input)

        # Freeze finished hypotheses so EOS beams are not expanded further.
        # eos_idx: [batch_size, beam_size]
        eos_idx = decoder_input.eq(Language.eos_token).view(batch_size, beam_size)

        # BUGFIX: the original `eos_idx.nonzero().dim() > 0` is always True
        # (nonzero() returns a 2-D tensor even when empty); .any() actually
        # tests whether any beam emitted EOS this step.
        if eos_idx.any():
            score.masked_fill_(eos_idx, -float('inf'))

    prediction, final_score, length = beam.backtrack()
    return prediction, final_score, length