    def sample_beam(self, fc_feats, att_feats, opt=None):

        opt = opt if opt is not None else {}  # avoid a mutable default argument
        beam_size = opt.get('beam_size', 10)
        batch_size = att_feats.size(0)

        # fc_feats: batch_size * model_size
        # att_feats: batch_size * att_size * model_size
        fc_feats, att_feats = self.embed_feats(fc_feats, att_feats)

        # fc_feats: (batch_size * beam_size) * model_size
        # (each instance's features are tiled beam_size times; see the
        # _tile_for_beams_sketch helper below for a standalone version of
        # this repeat/view tiling)
        new_fc_feats_size = (fc_feats.size(0) * beam_size, fc_feats.size(1))
        fc_feats = Variable(fc_feats.data.repeat(
            1, beam_size).view(new_fc_feats_size),
                            volatile=True)

        # att_feats: (batch_size * beam_size) * att_size * model_size
        new_output_enc_size = (att_feats.size(0) * beam_size,
                               att_feats.size(1), att_feats.size(2))
        output_enc = Variable(att_feats.data.repeat(
            1, beam_size, 1).view(new_output_enc_size),
                              volatile=True)

        # Prepare beams
        beams = [Beam(beam_size) for _ in range(batch_size)]
        # initially every beam index maps to its own instance index
        beam_inst_idx_map = {idx: idx for idx in range(batch_size)}
        n_remaining_sents = batch_size

        # Decode
        for i in range(self.seq_length + 1):

            len_dec_seq = i + 1

            # (n_remaining_sents * beam_size) * len_dec_seq
            masks = torch.FloatTensor(n_remaining_sents * beam_size,
                                      len_dec_seq).fill_(1)
            masks = Variable(masks.cuda())

            # n_remaining_sents * beam_size * len_dec_seq
            dec_partial_seq = torch.stack(
                [b.get_current_state() for b in beams if not b.done])
            # (n_remaining_sents * beam_size) * len_dec_seq
            dec_partial_seq = dec_partial_seq.view(-1, len_dec_seq)
            dec_partial_seq = Variable(dec_partial_seq, volatile=True).cuda()

            # size: 1 * len_dec_seq
            dec_partial_pos = torch.arange(1, len_dec_seq + 1).unsqueeze(0)
            # size: (n_remaining_sents * beam_size) * len_dec_seq
            dec_partial_pos = dec_partial_pos.repeat(
                n_remaining_sents * beam_size, 1)
            dec_partial_pos = Variable(dec_partial_pos.type(torch.LongTensor),
                                       volatile=True).cuda()

            # dec_partial_seq: (n_remaining_sents * beam_size) * len_dec_seq
            # dec_partial_pos: (n_remaining_sents * beam_size) * len_dec_seq
            # output_enc: (n_remaining_sents * beam_size) * att_size * model_size
            # masks: (n_remaining_sents * beam_size) * len_dec_seq
            # output_dec: (n_remaining_sents * beam_size) * len_dec_seq * model_size
            output_dec = self.decoder(dec_partial_seq, dec_partial_pos,
                                      fc_feats, output_enc, masks)

            # (n_remaining_sents * beam_size) * model_size
            output_dec = output_dec[:, -1, :]

            # (n_remaining_sents * beam_size) * (vocab_size+1)
            output = F.log_softmax(self.proj(output_dec), -1)

            # n_remaining_sents * beam_size * (vocab_size+1)
            word_lk = output.view(n_remaining_sents, beam_size,
                                  -1).contiguous()

            active_beam_idx_list = []
            for beam_idx in range(batch_size):
                if beams[beam_idx].done:
                    continue

                inst_idx = beam_inst_idx_map[beam_idx]
                if not beams[beam_idx].advance(word_lk.data[inst_idx]):
                    active_beam_idx_list += [beam_idx]

            if not active_beam_idx_list:
                # all instances have finished their path to <EOS>
                break

            # in this section, the sentences that are still active are
            # compacted so that the decoder is not run on completed sentences
            # (see _compact_active_sketch at the end of this file for a
            # standalone version of this pattern)
            active_inst_idxs = [
                beam_inst_idx_map[k] for k in active_beam_idx_list
            ]
            active_inst_idxs = torch.LongTensor(active_inst_idxs).cuda()

            # update the idx mapping
            beam_inst_idx_map = {
                beam_idx: inst_idx
                for inst_idx, beam_idx in enumerate(active_beam_idx_list)
            }

            # enc_info_var: (n_remaining_sents * beam_size) * att_size * model_size
            def update_active_enc_info(enc_info_var, active_inst_idxs):
                ''' Remove the encoder outputs of finished instances in one batch. '''

                inst_idx_dim_size, rest_dim_size1, rest_dim_size2 = \
                    enc_info_var.size()
                inst_idx_dim_size = (inst_idx_dim_size * len(active_inst_idxs)
                                     // n_remaining_sents)
                new_size = (inst_idx_dim_size, rest_dim_size1, rest_dim_size2)

                # select the active instances in batch
                # original_enc_info_data: n_remaining_sents * (beam_size * att_size) * model_size
                original_enc_info_data = enc_info_var.data.view(
                    n_remaining_sents, -1, self.model_size)
                active_enc_info_data = original_enc_info_data.index_select(
                    0, active_inst_idxs)
                active_enc_info_data = active_enc_info_data.view(*new_size)

                # active_enc_info_data: (inst_idx_dim_size * beam_size) * att_size * model_size
                return Variable(active_enc_info_data, volatile=True)

            # enc_info_var: (n_remaining_sents * beam_size) * model_size
            def update_active_fc_feats(enc_info_var, active_inst_idxs):
                ''' Remove the fc features of finished instances in one batch. '''

                inst_idx_dim_size, rest_dim_size1 = enc_info_var.size()
                inst_idx_dim_size = (inst_idx_dim_size * len(active_inst_idxs)
                                     // n_remaining_sents)
                new_size = (inst_idx_dim_size, rest_dim_size1)

                # select the active instances in batch
                # original_enc_info_data: n_remaining_sents * beam_size * model_size
                original_enc_info_data = enc_info_var.data.view(
                    n_remaining_sents, -1, self.model_size)
                active_enc_info_data = original_enc_info_data.index_select(
                    0, active_inst_idxs)
                active_enc_info_data = active_enc_info_data.view(*new_size)

                # active_enc_info_data: (inst_idx_dim_size * beam_size) * model_size
                return Variable(active_enc_info_data, volatile=True)

            # fc_feats: (inst_idx_dim_size * beam_size) * model_size
            fc_feats = update_active_fc_feats(fc_feats, active_inst_idxs)

            # output_enc: (inst_idx_dim_size * beam_size) * att_size * model_size
            output_enc = update_active_enc_info(output_enc, active_inst_idxs)

            # - update the remaining size
            n_remaining_sents = len(active_inst_idxs)

        # - Return useful information
        # all_hyp: batch_size lists, each holding the n_best hypotheses
        all_hyp, all_scores = [], []
        n_best = self.n_best

        for beam_idx in range(batch_size):
            scores, tail_idxs = beams[beam_idx].sort_scores()
            all_scores += [scores[:n_best]]

            hyps = [
                beams[beam_idx].get_hypothesis(i) for i in tail_idxs[:n_best]
            ]
            all_hyp += [hyps]

        seq = torch.LongTensor(batch_size, self.seq_length + 1).zero_()
        for i in range(batch_size):
            for j in range(len(all_hyp[i][0])):
                seq[i, j] = all_hyp[i][0][j]

        # list of length batch_size; each entry holds the top n_best
        # cumulative log-probabilities of the kept hypotheses
        seqLogprobs = all_scores

        return seq, seqLogprobs
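
    # A minimal standalone sketch of the repeat/view tiling used at the top of
    # sample_beam, assuming plain tensors; the helper name and its placement as
    # a @staticmethod are illustrative, not part of the original model API.
    @staticmethod
    def _tile_for_beams_sketch(feats, beam_size):
        ''' Tile each instance's feature row beam_size times along the batch
        dimension: batch_size x model_size -> (batch_size * beam_size) x
        model_size, with the copies of one instance kept adjacent
        ([x0, x0, ..., x1, x1, ...]). '''
        batch_size, model_size = feats.size()
        # repeating along dim 1 keeps each instance's copies contiguous, so
        # the row-major view below groups all beams of one instance together
        return feats.repeat(1, beam_size).view(batch_size * beam_size,
                                               model_size)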
    def translate_batch(self, src_batch, beam):
        self.model.eval()

        # Beam size (beam width) is the beam search parameter that determines
        # how many of the best partial hypotheses are kept at each step. In an
        # LSTM model of melody generation, for example, beam size limits the
        # number of candidate prefixes fed back into the decoder. A beam size
        # of 1 reduces to greedy search: only the single most probable
        # candidate is extended. A beam size of k decodes and re-ranks the top
        # k candidates at every step, so a larger beam means a more extensive
        # search than following the single best candidate.
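
        # A toy worked example (illustrative numbers, not from this model):
        # suppose at step 1 the log-probs are {a: -0.1, b: -0.7, c: -2.3}.
        # With beam_size=1 only the prefix "a" survives. With beam_size=2 the
        # prefixes "a" and "b" both survive; at step 2 each is extended by
        # every vocabulary item and the 2 * vocab_size extensions are
        # re-ranked by cumulative score, again keeping the best 2 overall.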

        # The position of the batch dimension depends on the data layout.
        src_seq, src_pos = src_batch
        batch_size = src_seq.size(0)
        beam_size = beam
        # beam_size = self.trans_opt.beam_size
        print('beam_size:', beam_size)

        # Encode
        enc_outputs, enc_slf_attns = self.model.encoder(src_seq, src_pos)

        # Repeat data for beam
        src_seq = Variable(src_seq.data.repeat(beam_size, 1))
        enc_outputs = [
            Variable(enc_output.data.repeat(beam_size, 1, 1))
            for enc_output in enc_outputs
        ]

        # Prepare beams
        beams = [Beam.Beam(beam_size, self.cuda) for _ in range(batch_size)]
        batch_idx = list(range(batch_size))
        n_remaining_sents = batch_size

        # A larger beam generally means a more accurate prediction at the
        # expense of memory and time. Beam search is a heuristic search
        # algorithm that builds its search tree breadth-first but prunes
        # low-scoring candidates at each level to keep the memory and time
        # requirements bounded.

        # Decode
        for i in range(self.trans_opt.max_trans_length):

            len_dec_seq = i + 1

            # -- Preparing decode data seq -- #
            input_data = torch.stack([
                b.get_current_state() for b in beams if not b.done
            ])  # size: n_remaining_sents x beam_size x len_dec_seq
            # size: (n_remaining_sents * beam_size) x len_dec_seq
            input_data = input_data.view(-1, len_dec_seq)
            input_data = Variable(input_data, volatile=True)

            # -- Preparing decode pos seq -- #
            # size: 1 x seq
            input_pos = torch.arange(1, len_dec_seq + 1).unsqueeze(0)
            # size: (batch * beam) x seq
            input_pos = input_pos.repeat(n_remaining_sents * beam_size, 1)
            input_pos = Variable(input_pos.type(torch.LongTensor),
                                 volatile=True)

            if self.cuda:
                input_pos = input_pos.cuda()
                input_data = input_data.cuda()

            # -- Decoding -- #
            dec_outputs, dec_slf_attns, dec_enc_attns = self.model.decoder(
                input_data, input_pos, src_seq, enc_outputs)
            dec_output = dec_outputs[-1][:, -1, :]  # (batch * beam) * d_model
            dec_output = self.model.tgt_word_proj(dec_output)
            out = self.model.prob_projection(dec_output)

            # batch x beam x n_words
            word_lk = out.view(n_remaining_sents, beam_size, -1).contiguous()

            active = []
            for b in range(batch_size):
                if beams[b].done:
                    continue

                idx = batch_idx[b]
                if not beams[b].advance(word_lk.data[idx]):
                    active += [b]

            if not active:
                break

            # in this section, the sentences that are still active are
            # compacted so that the decoder is not run on completed sentences
            # (see _compact_active_sketch after this method for a standalone
            # version of this pattern)
            active_idx = self.tt.LongTensor([batch_idx[k] for k in active])
            batch_idx = {b: idx for idx, b in enumerate(active)}

            def update_active_enc_info(tensor_var, active_idx):
                ''' Remove the encoder outputs of finished instances in one batch. '''
                tensor_data = tensor_var.data.view(n_remaining_sents, -1,
                                                   self.model_opt.d_model)

                new_size = list(tensor_var.size())
                new_size[0] = new_size[0] * len(
                    active_idx) // n_remaining_sents

                # select the active index in batch
                return Variable(tensor_data.index_select(
                    0, active_idx).view(*new_size),
                                volatile=True)

            def update_active_seq(seq, active_idx):
                ''' Remove the src sequence of finished instances in one batch. '''
                view = seq.data.view(n_remaining_sents, -1)
                new_size = list(seq.size())
                new_size[0] = new_size[0] * len(
                    active_idx) // n_remaining_sents  # trim on batch dim

                # select the active index in batch
                return Variable(view.index_select(0,
                                                  active_idx).view(*new_size),
                                volatile=True)

            src_seq = update_active_seq(src_seq, active_idx)
            enc_outputs = [
                update_active_enc_info(enc_output, active_idx)
                for enc_output in enc_outputs
            ]
            n_remaining_sents = len(active)

        # Return useful information
        all_hyp, all_scores = [], []
        n_best = self.trans_opt.n_best

        for b in range(batch_size):
            scores, ks = beams[b].sort_scores()
            all_scores += [scores[:n_best]]
            hyps = [beams[b].get_hypothesis(k) for k in ks[:n_best]]
            all_hyp += [hyps]

        decoded = [self.trans_opt.ctable.decode(hyps[0]) for hyps in all_hyp]

        return decoded, all_hyp, all_scores, enc_outputs, dec_outputs, enc_slf_attns, dec_slf_attns, dec_enc_attns
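
    # A minimal standalone sketch of the active-instance compaction performed
    # by the update_active_* closures in both methods above, assuming plain
    # tensors; the helper name and @staticmethod placement are illustrative,
    # not part of the original model API.
    @staticmethod
    def _compact_active_sketch(data, active_idx, n_remaining_sents):
        ''' data: (n_remaining_sents * beam_size) x ...; keep only the
        instances listed in active_idx (a LongTensor of instance indices)
        and return the compacted (len(active_idx) * beam_size) x ... tensor.
        '''
        new_size = list(data.size())
        new_size[0] = new_size[0] * len(active_idx) // n_remaining_sents
        # fold the beam (and any trailing) dims into dim 1 so that dim 0
        # indexes instances, select the active instances on dim 0, then
        # restore the flattened (instances * beam_size) layout
        grouped = data.contiguous().view(n_remaining_sents, -1)
        return grouped.index_select(0, active_idx).view(*new_size)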