Example #1
    def test_char_rnn_generate(self):
        test_args = test_utils.ModelParamsDict(sequence_lstm=True)
        test_args.arch = "char_source"
        test_args.char_source_dict_size = 126
        test_args.char_embed_dim = 8
        test_args.char_rnn_units = 12
        test_args.char_rnn_layers = 2

        _, src_dict, tgt_dict = test_utils.prepare_inputs(test_args)
        task = tasks.DictionaryHolderTask(src_dict, tgt_dict)
        model = task.build_model(test_args)
        translator = beam_decode.SequenceGenerator([model],
                                                   task.target_dictionary,
                                                   use_char_source=True)
        src_tokens = torch.LongTensor([[0, 0, 0], [0, 0, 0]])
        src_lengths = torch.LongTensor([3, 3])
        char_inds = torch.LongTensor(np.zeros((2, 3, 5)))
        word_lengths = torch.LongTensor([[5, 5, 5], [5, 5, 5]])
        encoder_input = {
            "src_tokens": src_tokens,
            "src_lengths": src_lengths,
            "char_inds": char_inds,
            "word_lengths": word_lengths,
        }
        translator.generate(encoder_input, maxlen=7)
Example #2
    def test_dual_decoder_args(self):
        test_args = test_utils.ModelParamsDict(arch="dual_decoder_kd")
        _, src_dict, tgt_dict = test_utils.prepare_inputs(test_args)
        self.task = tasks.DictionaryHolderTask(src_dict, tgt_dict)
        model = self.task.build_model(test_args)

        assert (model.encoder.transformer_embedding.embed_tokens.embedding_dim
                == test_args.encoder_embed_dim)
        assert (model.encoder.transformer_encoder_given_embeddings.layers[0].
                fc1.out_features == test_args.encoder_ffn_embed_dim)
        assert (len(model.encoder.transformer_encoder_given_embeddings.layers)
                == test_args.encoder_layers)
        assert (model.encoder.transformer_encoder_given_embeddings.layers[0].
                self_attn.num_heads == test_args.encoder_attention_heads)
        assert (model.teacher_decoder.embed_tokens.embedding_dim ==
                test_args.decoder_embed_dim)
        assert (model.teacher_decoder.layers[0].fc1.out_features ==
                test_args.decoder_ffn_embed_dim)
        assert len(model.teacher_decoder.layers) == test_args.decoder_layers
        assert (model.teacher_decoder.layers[0].self_attn.num_heads ==
                test_args.decoder_attention_heads)
        assert (model.student_decoder.embed_tokens.embedding_dim ==
                test_args.student_decoder_embed_dim)
        assert model.student_decoder.num_layers == test_args.student_decoder_layers
        assert (model.student_decoder.num_attention_heads ==
                test_args.student_decoder_attention_heads)
        assert model.student_decoder.lstm_units == test_args.student_decoder_lstm_units
        assert (model.student_decoder.out_embed_dim ==
                test_args.student_decoder_out_embed_dim)
Example #3
 def test_smoothed_sentence_bleu(self):
     """
     Testing calculation of smoothed_sentence_bleu() function.
     Inputs:
         target_tokens: [11, 12, 13, 14, 15]
         hypo_tokens: [11, 12, 14, 15]
         actual precision:
             unigram: 4/4 = 1
             bigram:  2/3 = 0.667
             trigram: 0/2 = 0
             4-gram:  0/1 = 0
         smoothed precision:
             unigram: 4/4    = 1
             bigram:  2/3    = 0.667
             trigram: 0.5/2  = 0.25
             4-gram:  0.25/1 = 0.25
         smoothed geom. mean: (1 * 2/3 * 1/4 * 1/4) ^ (1/4) = 0.4518
         brevity penalty: e ^ (1 - 5/4) = 0.7788
     Desired Output:
         0.4518 * 0.7788 = 0.35186
     """
     test_args = test_utils.ModelParamsDict()
     _, src_dict, tgt_dict = test_utils.prepare_inputs(test_args)
     task = tasks.DictionaryHolderTask(src_dict, tgt_dict)
     target_tokens = torch.IntTensor([11, 12, 13, 14, 15])
     hypo_tokens = torch.IntTensor([11, 12, 14, 15])
     smoothed_bleu = generate.smoothed_sentence_bleu(
         task, target_tokens, hypo_tokens)
     np.testing.assert_almost_equal(smoothed_bleu, 0.35186, decimal=5)
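The arithmetic in the docstring can be verified directly; here is a minimal numpy sketch of the same computation (not the library's implementation):

import numpy as np

# Precisions from the docstring: exact for orders with matches, smoothed by
# halving the numerator (0.5, then 0.25) for orders with zero matches.
precisions = np.array([4 / 4, 2 / 3, 0.5 / 2, 0.25 / 1])
geo_mean = precisions.prod() ** 0.25                 # ~0.4518
brevity_penalty = np.exp(1 - 5 / 4)                  # ~0.7788 (ref len 5, hyp len 4)
print(round(float(geo_mean * brevity_penalty), 5))   # 0.35186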
Example #4
    def test_dual_decoder_kd_loss(self):
        test_args = test_utils.ModelParamsDict(arch="dual_decoder_kd")
        _, src_dict, tgt_dict = test_utils.prepare_inputs(test_args)
        self.task = tasks.DictionaryHolderTask(src_dict, tgt_dict)
        sample = self._dummy_sample()
        model = self.task.build_model(test_args)

        test_args.kd_weight = 0.5
        test_args.label_smoothing = 0.1
        criterion = dual_decoder_kd_loss.DualDecoderCriterion(
            test_args, self.task)

        src_tokens = sample["net_input"]["src_tokens"]
        src_lengths = sample["net_input"]["src_lengths"]
        prev_output_tokens = sample["net_input"]["prev_output_tokens"]

        encoder_out = model.encoder(src_tokens, src_lengths)
        student_output = model.student_decoder(prev_output_tokens, encoder_out)
        teacher_output = model.teacher_decoder(prev_output_tokens, encoder_out)

        teacher_loss, teacher_nll_loss, teacher_probs = criterion.compute_teacher_loss(
            model, teacher_output, sample, reduce=True)

        # probabilities for each label should sum to one
        assert ((teacher_probs.sum(dim=1) - 1.0).abs() < 1e-6).all()

        student_loss, student_nll_loss = criterion.compute_student_loss(
            model, student_output, sample, teacher_probs, reduce=True)
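For context, kd_weight typically interpolates between the soft teacher objective and the hard NLL objective. The following is a hedged sketch of that blending, not the actual DualDecoderCriterion internals:

import torch
import torch.nn.functional as F

def blended_kd_loss(student_lprobs, teacher_probs, target, kd_weight=0.5):
    # student_lprobs: [N, vocab] log-probs; teacher_probs: [N, vocab] probs.
    kd_loss = -(teacher_probs * student_lprobs).sum(dim=1).mean()
    nll_loss = F.nll_loss(student_lprobs, target)
    # Interpolate the soft (teacher) and hard (reference) objectives.
    return kd_weight * kd_loss + (1.0 - kd_weight) * nll_loss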
Example #5
    def test_topk_kd_loss(self):
        """
        Makes sure that we can build KD loss without problem.
        """
        test_args = test_utils.ModelParamsDict()
        _, src_dict, tgt_dict = test_utils.prepare_inputs(test_args)
        self.task = tasks.DictionaryHolderTask(src_dict, tgt_dict)
        sample = self._dummy_sample()
        model = self.task.build_model(test_args)
        net_output = model(**sample["net_input"])
        lprobs = model.get_normalized_probs(net_output, log_probs=True)
        # [bsz, seqlen, vocab] -> [bsz*seqlen, vocab]
        lprobs = lprobs.view(-1, lprobs.size(-1))

        teacher_model = self.task.build_model(test_args)
        teacher_probs = teacher_model.get_normalized_probs(net_output,
                                                           log_probs=False)
        teacher_probs = teacher_probs.view(-1, teacher_probs.size(-1)).detach()

        # Getting the topk probabilities, masking others, normalizing the topk
        # probabilities.
        top_k_probs, indices = torch.topk(teacher_probs, k=3)
        top_k_probs_normalized = top_k_probs / torch.sum(
            top_k_probs, dim=1, keepdim=True)
        topk_mask = torch.zeros(teacher_probs.shape)
        topk_probs = topk_mask.scatter(1, indices, top_k_probs_normalized)

        # Check that nonzero teacher probabilities were scattered into the
        # corresponding positions of topk_probs.
        for i, index in enumerate(indices):
            for j in index:
                if teacher_probs[i][j] > 0:
                    assert topk_probs[i][j] > 0

        kd_loss = -torch.sum(topk_probs * lprobs)
        assert kd_loss >= 0
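The top-k mask-and-scatter pattern above, isolated on a tiny tensor (values are arbitrary):

import torch

probs = torch.tensor([[0.10, 0.40, 0.20, 0.25, 0.05],
                      [0.30, 0.10, 0.10, 0.20, 0.30]])
top_vals, top_idx = torch.topk(probs, k=3, dim=1)
top_vals = top_vals / top_vals.sum(dim=1, keepdim=True)  # renormalize per row
sparse = torch.zeros_like(probs).scatter(1, top_idx, top_vals)
assert torch.allclose(sparse.sum(dim=1), torch.ones(2))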
Example #6
    def test_collect_top_k_probs(self):
        test_args = test_utils.ModelParamsDict(arch="hybrid_transformer_rnn")
        _, src_dict, tgt_dict = test_utils.prepare_inputs(test_args)
        self.task = tasks.DictionaryHolderTask(src_dict, tgt_dict)
        model = self.task.build_model(test_args)

        use_cuda = torch.cuda.is_available()
        if use_cuda:
            model.cuda()
        model.eval()

        binarized_source = test_utils.create_dummy_binarized_dataset()
        binarized_target = test_utils.create_dummy_binarized_dataset(
            append_eos=True)
        dataset = language_pair_dataset.LanguagePairDataset(
            src=binarized_source,
            src_sizes=binarized_source.sizes,
            src_dict=self.task.src_dict,
            tgt=binarized_target,
            tgt_sizes=binarized_target.sizes,
            tgt_dict=self.task.dst_dict,
            left_pad_source=False,
        )

        top_k_scores, top_k_indices = collect_top_k_probs.compute_top_k(
            task=self.task,
            models=[model],
            dataset=dataset,
            k=3,
            use_cuda=use_cuda,
            max_tokens=None,
            max_sentences=None,
            progress_bar_args=None,
        )

        batch = language_pair_dataset.collate(
            [dataset[0]],
            pad_idx=self.task.src_dict.pad(),
            eos_idx=self.task.src_dict.eos(),
            left_pad_source=False,
        )

        sample = batch["net_input"]
        if use_cuda:
            sample = utils.move_to_cuda(sample)

        with torch.no_grad():
            net_output = model(**sample)
            probs = model.get_normalized_probs(net_output, log_probs=False)

        top_probs, top_indices = torch.topk(probs[0, 0], k=3)
        if use_cuda:
            top_probs = top_probs.cpu()
            top_indices = top_indices.cpu()

        np.testing.assert_array_equal(top_k_indices[0], top_indices.numpy())
        normalized_probs = (top_probs / top_probs.sum()).numpy()
        np.testing.assert_almost_equal(top_k_scores[0], normalized_probs)
Example #7
    def test_char_rnn_equivalent(self):
        """Ensure that the CharRNNEncoder.onnx_export_model path does not
        change computation"""
        test_args = test_utils.ModelParamsDict(
            encoder_bidirectional=True, sequence_lstm=True
        )
        lexical_dictionaries = test_utils.create_lexical_dictionaries()
        test_args.vocab_reduction_params = {
            "lexical_dictionaries": lexical_dictionaries,
            "num_top_words": 5,
            "max_translation_candidates_per_word": 1,
        }

        test_args.arch = "char_source"
        test_args.char_source_dict_size = 126
        test_args.char_embed_dim = 8
        test_args.char_rnn_units = 12
        test_args.char_rnn_layers = 2

        _, src_dict, tgt_dict = test_utils.prepare_inputs(test_args)
        task = tasks.DictionaryHolderTask(src_dict, tgt_dict)

        num_models = 3
        model_list = []
        for _ in range(num_models):
            model_list.append(task.build_model(test_args))
        encoder_ensemble = CharSourceEncoderEnsemble(model_list)

        length = 5
        src_tokens = torch.LongTensor(
            np.random.randint(0, len(src_dict), (length, 1), dtype="int64")
        )
        src_lengths = torch.IntTensor(np.array([length], dtype="int32"))
        word_length = 3
        char_inds = torch.LongTensor(
            np.random.randint(0, 126, (1, length, word_length), dtype="int64")
        )
        word_lengths = torch.IntTensor(
            np.array([word_length] * length, dtype="int32")
        ).reshape((1, length))

        onnx_path_outputs = encoder_ensemble(
            src_tokens, src_lengths, char_inds, word_lengths
        )

        for model in encoder_ensemble.models:
            model.encoder.onnx_export_model = False

        original_path_outputs = encoder_ensemble(
            src_tokens, src_lengths, char_inds, word_lengths
        )

        for (onnx_out, original_out) in zip(onnx_path_outputs, original_path_outputs):
            onnx_array = onnx_out.detach().numpy()
            original_array = original_out.detach().numpy()
            assert onnx_array.shape == original_array.shape
            np.testing.assert_allclose(onnx_array, original_array)
Example #8
 def test_basic_generate(self):
     test_args = test_utils.ModelParamsDict()
     _, src_dict, tgt_dict = test_utils.prepare_inputs(test_args)
     task = tasks.DictionaryHolderTask(src_dict, tgt_dict)
     model = task.build_model(test_args)
     translator = beam_decode.SequenceGenerator([model], task.target_dictionary)
     src_tokens = torch.LongTensor([[0, 0, 0], [0, 0, 0]])
     src_lengths = torch.LongTensor([3, 3])
     encoder_input = (src_tokens, src_lengths)
     translator.generate(encoder_input, maxlen=7)
Example #9
 def test_load_pretrained_embedding(self):
     test_args = test_utils.ModelParamsDict()
     _, src_dict, tgt_dict = test_utils.prepare_inputs(test_args)
     encoder_embed_path, embed_weights = test_utils.create_pretrained_embed(
         src_dict, test_args.encoder_hidden_dim)
     test_args.encoder_pretrained_embed = encoder_embed_path
     task = tasks.DictionaryHolderTask(src_dict, tgt_dict)
     model = task.build_model(test_args)
     assert np.allclose(model.encoder.embed_tokens.weight.data.numpy(),
                        embed_weights)
     os.remove(encoder_embed_path)
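Mechanically, loading a pretrained embedding amounts to copying a weight matrix into an nn.Embedding; a minimal sketch with illustrative sizes (not the test_utils helper):

import numpy as np
import torch
import torch.nn as nn

vocab_size, embed_dim = 100, 32                       # illustrative sizes
pretrained = np.random.rand(vocab_size, embed_dim).astype("float32")
embed = nn.Embedding(vocab_size, embed_dim)
embed.weight.data.copy_(torch.from_numpy(pretrained))
assert np.allclose(embed.weight.data.numpy(), pretrained)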
Example #10
def gpu_train_step(test_args: ModelParamsDict) -> Tuple[Trainer, Dict[Any, Any]]:
    """Sets up inputs from test_args then executes a single train step. A train
    step always requires a GPU."""
    samples, src_dict, tgt_dict = prepare_inputs(test_args)
    task = tasks.DictionaryHolderTask(src_dict, tgt_dict)
    model = task.build_model(test_args)
    criterion = task.build_criterion(test_args)
    sample = next(samples)
    trainer = Trainer(test_args, task, model, criterion, dummy_batch=sample)
    logging_dict = trainer.train_step([sample])
    return trainer, logging_dict
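Hypothetical usage of this helper, guarded on GPU availability (the ModelParamsDict arguments are illustrative):

import torch

if torch.cuda.is_available():
    test_args = ModelParamsDict(sequence_lstm=True)  # illustrative args
    trainer, logging_dict = gpu_train_step(test_args)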
Example #11
 def _gpu_train_step(self, test_args):
     samples, src_dict, tgt_dict = test_utils.prepare_inputs(test_args)
     task = tasks.DictionaryHolderTask(src_dict, tgt_dict)
     model = task.build_model(test_args)
     criterion = task.build_criterion(test_args)
     sample = next(samples)
     trainer = Trainer(test_args,
                       task,
                       model,
                       criterion,
                       dummy_batch=sample)
     logging_dict = trainer.train_step([sample])
     return trainer, logging_dict
Example #12
    def _test_forced_decoder_export(self, test_args):
        _, src_dict, tgt_dict = test_utils.prepare_inputs(test_args)
        task = tasks.DictionaryHolderTask(src_dict, tgt_dict)

        num_models = 3
        model_list = []
        for _ in range(num_models):
            model_list.append(task.build_model(test_args))

        forced_decoder_ensemble = ForcedDecoder(
            model_list, tgt_dict, word_reward=0.25, unk_reward=-0.5
        )

        tmp_dir = tempfile.mkdtemp()
        forced_decoder_pb_path = os.path.join(tmp_dir, "forced_decoder.pb")
        forced_decoder_ensemble.onnx_export(forced_decoder_pb_path)
Example #13
    def _test_ensemble_encoder_export_char_source(self, test_args):
        _, src_dict, tgt_dict = test_utils.prepare_inputs(test_args)
        task = tasks.DictionaryHolderTask(src_dict, tgt_dict)

        num_models = 3
        model_list = []
        for _ in range(num_models):
            model_list.append(task.build_model(test_args))
        encoder_ensemble = CharSourceEncoderEnsemble(model_list)

        tmp_dir = tempfile.mkdtemp()
        encoder_pb_path = os.path.join(tmp_dir, "char_encoder.pb")
        encoder_ensemble.onnx_export(encoder_pb_path)

        length = 5
        src_tokens = torch.LongTensor(np.ones((length, 1), dtype="int64"))
        src_lengths = torch.IntTensor(np.array([length], dtype="int32"))
        word_length = 3
        char_inds = torch.LongTensor(
            np.ones((1, length, word_length), dtype="int64"))
        word_lengths = torch.IntTensor(
            np.array([word_length] * length, dtype="int32")).reshape(
                (1, length))

        pytorch_encoder_outputs = encoder_ensemble(src_tokens, src_lengths,
                                                   char_inds, word_lengths)

        onnx_encoder = caffe2_backend.prepare_zip_archive(encoder_pb_path)

        caffe2_encoder_outputs = onnx_encoder.run((
            src_tokens.numpy(),
            src_lengths.numpy(),
            char_inds.numpy(),
            word_lengths.numpy(),
        ))

        for i in range(len(pytorch_encoder_outputs)):
            caffe2_out_value = caffe2_encoder_outputs[i]
            pytorch_out_value = pytorch_encoder_outputs[i].detach().numpy()
            np.testing.assert_allclose(caffe2_out_value,
                                       pytorch_out_value,
                                       rtol=1e-4,
                                       atol=1e-6)

        encoder_ensemble.save_to_db(
            os.path.join(tmp_dir, "encoder.predictor_export"))
Example #14
    def test_topk_kd_loss(self):
        """
        Makes sure that we can build KD loss without problem.
        """
        test_args = test_utils.ModelParamsDict()
        _, src_dict, tgt_dict = test_utils.prepare_inputs(test_args)
        self.task = tasks.DictionaryHolderTask(src_dict, tgt_dict)
        sample = self._dummy_sample()
        model = self.task.build_model(test_args)
        net_output = model(**sample["net_input"])
        student_probs = model.get_normalized_probs(net_output, log_probs=True)
        # [bsz, seqlen, vocab] -> [bsz*seqlen, vocab]
        lprobs = student_probs.view(-1, student_probs.size(-1))

        teacher_model = self.task.build_model(test_args)
        teacher_probs = teacher_model.get_normalized_probs(net_output,
                                                           log_probs=False)
        top_k_teacher_probs, indices = torch.topk(teacher_probs, k=3)
        top_k_teacher_probs_normalized = F.normalize(top_k_teacher_probs,
                                                     p=1,
                                                     dim=2).detach()
        sample["top_k_scores"] = top_k_teacher_probs_normalized
        sample["top_k_indices"] = indices

        kd_criterion = knowledge_distillation_loss.KnowledgeDistillationCriterion(
            test_args, self.task)
        kd_loss, topk_probs = kd_criterion.get_kd_loss(sample, student_probs,
                                                       lprobs)

        # asserting that the values are correctly inserted into topk_probs.
        for row in range(indices.shape[0]):
            for col in range(indices.shape[1]):
                # testing if values are normalized.
                assert round(float(torch.sum(topk_probs[row][col][:])),
                             0) == 1.0
                for i, val in enumerate(indices[row][col]):
                    # testing if scattering is done correctly.
                    assert (topk_probs[row][col][val] ==
                            top_k_teacher_probs_normalized[row][col][i])

        topk_probs_flat = topk_probs.view(-1, topk_probs.size(-1))
        kd_loss = -torch.sum(topk_probs_flat * lprobs)
        assert kd_loss >= 0
Example #15
 def test_diversity_sibling_rank(self):
     """
     Testing calculation of sibling_rank() function.
     """
     test_args = test_utils.ModelParamsDict()
     _, src_dict, tgt_dict = test_utils.prepare_inputs(test_args)
     task = tasks.DictionaryHolderTask(src_dict, tgt_dict)
     model = task.build_model(test_args)
     translator = beam_decode.SequenceGenerator([model],
                                                task.target_dictionary)
     logprobs = torch.FloatTensor([[[2, 1, 3, 5, 6], [0, 1, 3, 2, 4]],
                                   [[2, 3, 1, 5, 0], [3, 1, 5, 2, 0]]])
     logprobs_out = torch.FloatTensor([
         [[-1, -3, 1, 4, 6], [-4, -2, 2, 0, 4]],
         [[0, 2, -2, 5, -4], [2, -2, 5, 0, -4]],
     ])
     logprobs = translator.diversity_sibling_rank(logprobs, 1)
     np.testing.assert_allclose(actual=logprobs_out.view(-1, 5).numpy(),
                                desired=logprobs.numpy(),
                                atol=1e-5)
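A worked check of the first row, assuming the 0-based sibling-rank penalty these numbers imply (penalized logprob = logprob - gamma * rank among siblings):

import torch

row = torch.tensor([2.0, 1.0, 3.0, 5.0, 6.0])
ranks = torch.empty_like(row)
ranks[row.argsort(descending=True)] = torch.arange(5.0)  # 0 for the best sibling
assert torch.equal(row - 1.0 * ranks,
                   torch.tensor([-1.0, -3.0, 1.0, 4.0, 6.0]))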
Example #16
    def _test_ensemble_encoder_export(self, test_args):
        samples, src_dict, tgt_dict = test_utils.prepare_inputs(test_args)
        task = tasks.DictionaryHolderTask(src_dict, tgt_dict)

        num_models = 3
        model_list = []
        for _ in range(num_models):
            model_list.append(task.build_model(test_args))
        encoder_ensemble = EncoderEnsemble(model_list)

        tmp_dir = tempfile.mkdtemp()
        encoder_pb_path = os.path.join(tmp_dir, "encoder.pb")
        encoder_ensemble.onnx_export(encoder_pb_path)

        # test equivalence
        # The discrepancy in types here is a temporary expedient.
        # PyTorch indexing requires int64 while support for tracing
        # pack_padded_sequence() requires int32.
        sample = next(samples)
        src_tokens = sample["net_input"]["src_tokens"][0:1].t()
        src_lengths = sample["net_input"]["src_lengths"][0:1].int()

        pytorch_encoder_outputs = encoder_ensemble(src_tokens, src_lengths)

        onnx_encoder = caffe2_backend.prepare_zip_archive(encoder_pb_path)

        caffe2_encoder_outputs = onnx_encoder.run(
            (src_tokens.numpy(), src_lengths.numpy()))

        for i in range(len(pytorch_encoder_outputs)):
            caffe2_out_value = caffe2_encoder_outputs[i]
            pytorch_out_value = pytorch_encoder_outputs[i].detach().numpy()
            np.testing.assert_allclose(caffe2_out_value,
                                       pytorch_out_value,
                                       rtol=1e-4,
                                       atol=1e-6)

        encoder_ensemble.save_to_db(
            os.path.join(tmp_dir, "encoder.predictor_export"))
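The int64-vs-int32 note above in miniature; a self-contained sketch:

import torch
from torch.nn.utils.rnn import pack_padded_sequence

src_tokens = torch.randint(0, 100, (5, 1))          # indices default to int64
src_lengths = torch.tensor([5], dtype=torch.int32)  # int32 for packing
embedded = torch.nn.Embedding(100, 8)(src_tokens)   # lookup requires int64
packed = pack_padded_sequence(embedded, src_lengths)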
Example #17
    def test_topk_kd_loss(self):
        """
        Makes sure that we can build KD loss without problem.
        """
        test_args = test_utils.ModelParamsDict()
        _, src_dict, tgt_dict = test_utils.prepare_inputs(test_args)
        self.task = tasks.DictionaryHolderTask(src_dict, tgt_dict)
        sample = self._dummy_sample()
        model = self.task.build_model(test_args)
        net_output = model(**sample["net_input"])
        student_lprobs = model.get_normalized_probs(net_output, log_probs=True)
        # [bsz, seqlen, vocab] -> [bsz*seqlen, vocab]
        lprobs = student_lprobs.view(-1, student_lprobs.size(-1))

        teacher_model = self.task.build_model(test_args)
        teacher_probs = teacher_model.get_normalized_probs(net_output,
                                                           log_probs=False)
        top_k_teacher_probs, indices = torch.topk(teacher_probs, k=3)
        top_k_teacher_probs_normalized = F.normalize(top_k_teacher_probs,
                                                     p=1,
                                                     dim=2).detach()
        sample["top_k_scores"] = top_k_teacher_probs_normalized
        sample["top_k_indices"] = indices

        kd_criterion = (knowledge_distillation_loss.
                        KnowledgeDistillationCriterion.build_criterion(
                            test_args, self.task))
        kd_loss = kd_criterion.get_kd_loss(sample, student_lprobs, lprobs)

        # Calculate kd_loss using full matrix and compare
        topk_mask = torch.zeros(student_lprobs.shape).type_as(student_lprobs)
        topk_probs = topk_mask.scatter(2, indices,
                                       top_k_teacher_probs_normalized.float())
        topk_probs_flat = topk_probs.view(-1, topk_probs.size(-1))
        kd_loss_2 = -(torch.sum(topk_probs_flat * lprobs))
        np.testing.assert_almost_equal(kd_loss.item(),
                                       kd_loss_2.item(),
                                       decimal=4)
        assert kd_loss >= 0
Example #18
    def _test_beam_component_equivalence(self, test_args):
        beam_size = 5
        samples, src_dict, tgt_dict = test_utils.prepare_inputs(test_args)
        task = tasks.DictionaryHolderTask(src_dict, tgt_dict)

        num_models = 3
        model_list = []
        for _ in range(num_models):
            model_list.append(task.build_model(test_args))

        # to initialize BeamSearch object
        sample = next(samples)
        # [seq len, batch size=1]
        src_tokens = sample["net_input"]["src_tokens"][0:1].t()
        # [seq len]
        src_lengths = sample["net_input"]["src_lengths"][0:1].long()

        full_beam_search = BeamSearch(model_list,
                                      tgt_dict,
                                      src_tokens,
                                      src_lengths,
                                      beam_size=beam_size)

        encoder_ensemble = EncoderEnsemble(model_list)

        # to initialize decoder_step_ensemble
        with torch.no_grad():
            pytorch_encoder_outputs = encoder_ensemble(src_tokens, src_lengths)

        decoder_step_ensemble = DecoderBatchedStepEnsemble(model_list,
                                                           tgt_dict,
                                                           beam_size=beam_size)

        prev_token = torch.LongTensor([tgt_dict.eos()])
        prev_scores = torch.FloatTensor([0.0])
        attn_weights = torch.zeros(src_tokens.shape[0])
        prev_hypos_indices = torch.zeros(beam_size, dtype=torch.int64)
        num_steps = torch.LongTensor([2])

        with torch.no_grad():
            (
                bs_out_tokens,
                bs_out_scores,
                bs_out_weights,
                bs_out_prev_indices,
            ) = full_beam_search(
                src_tokens,
                src_lengths,
                prev_token,
                prev_scores,
                attn_weights,
                prev_hypos_indices,
                num_steps,
            )

        comp_out_tokens = (np.ones([num_steps + 1, beam_size], dtype="int64") *
                           tgt_dict.eos())
        comp_out_scores = np.zeros([num_steps + 1, beam_size])
        comp_out_weights = np.zeros(
            [num_steps + 1, beam_size,
             src_lengths.numpy()[0]])
        comp_out_prev_indices = np.zeros([num_steps + 1, beam_size],
                                         dtype="int64")

        # single EOS in flat array
        input_tokens = torch.LongTensor(np.array([tgt_dict.eos()]))
        prev_scores = torch.FloatTensor(np.array([0.0]))
        timestep = torch.LongTensor(np.array([0]))

        with torch.no_grad():
            pytorch_first_step_outputs = decoder_step_ensemble(
                input_tokens, prev_scores, timestep, *pytorch_encoder_outputs)

        comp_out_tokens[1, :] = pytorch_first_step_outputs[0]
        comp_out_scores[1, :] = pytorch_first_step_outputs[1]
        comp_out_prev_indices[1, :] = pytorch_first_step_outputs[2]
        comp_out_weights[1, :, :] = pytorch_first_step_outputs[3]

        next_input_tokens = pytorch_first_step_outputs[0]
        next_prev_scores = pytorch_first_step_outputs[1]
        timestep += 1

        # Tile states after first timestep
        next_states = list(pytorch_first_step_outputs[4:])
        for i in range(len(model_list)):
            next_states[i] = next_states[i].repeat(1, beam_size, 1)

        with torch.no_grad():
            pytorch_next_step_outputs = decoder_step_ensemble(
                next_input_tokens, next_prev_scores, timestep, *next_states)

        comp_out_tokens[2, :] = pytorch_next_step_outputs[0]
        comp_out_scores[2, :] = pytorch_next_step_outputs[1]
        comp_out_prev_indices[2, :] = pytorch_next_step_outputs[2]
        comp_out_weights[2, :, :] = pytorch_next_step_outputs[3]

        np.testing.assert_array_equal(comp_out_tokens, bs_out_tokens.numpy())
        np.testing.assert_allclose(comp_out_scores,
                                   bs_out_scores.numpy(),
                                   rtol=1e-4,
                                   atol=1e-6)
        np.testing.assert_array_equal(comp_out_prev_indices,
                                      bs_out_prev_indices.numpy())
        np.testing.assert_allclose(comp_out_weights,
                                   bs_out_weights.numpy(),
                                   rtol=1e-4,
                                   atol=1e-6)
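The per-beam state tiling used above (repeat along the batch dimension), in miniature:

import torch

state = torch.arange(6.0).reshape(1, 1, 6)  # [layers=1, batch=1, hidden=6]
tiled = state.repeat(1, 5, 1)               # one copy per beam hypothesis
assert tiled.shape == (1, 5, 6)
assert torch.equal(tiled[:, 3], state[:, 0])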
Example #19
    def _test_full_beam_decoder(self, test_args):
        samples, src_dict, tgt_dict = test_utils.prepare_inputs(test_args)
        task = tasks.DictionaryHolderTask(src_dict, tgt_dict)
        sample = next(samples)
        src_tokens = sample["net_input"]["src_tokens"][0:1].t()
        src_lengths = sample["net_input"]["src_lengths"][0:1].int()

        num_models = 3
        model_list = []
        for _ in range(num_models):
            model_list.append(task.build_model(test_args))

        bs = BeamSearch(model_list,
                        tgt_dict,
                        src_tokens,
                        src_lengths,
                        beam_size=6)
        prev_token = torch.LongTensor([0])
        prev_scores = torch.FloatTensor([0.0])
        attn_weights = torch.zeros(11)
        prev_hypos_indices = torch.zeros(6, dtype=torch.int64)

        outs = bs(
            src_tokens,
            src_lengths,
            prev_token,
            prev_scores,
            attn_weights,
            prev_hypos_indices,
            torch.LongTensor([20]),
        )

        import io

        f = io.BytesIO()
        torch.onnx._export(
            bs,
            (
                src_tokens,
                src_lengths,
                prev_token,
                prev_scores,
                attn_weights,
                prev_hypos_indices,
                torch.LongTensor([20]),
            ),
            f,
            export_params=True,
            verbose=False,
            example_outputs=outs,
        )

        f.seek(0)

        onnx_model = onnx.load(f)
        c2_model = caffe2_backend.prepare(onnx_model)
        c2_model.run((
            src_tokens.numpy(),
            src_lengths.numpy(),
            prev_token.numpy(),
            prev_scores.numpy(),
            attn_weights.numpy(),
            prev_hypos_indices.numpy(),
            np.array([20]),
        ))
Example #20
    def _test_batched_beam_decoder_step(self,
                                        test_args,
                                        return_caffe2_rep=False):
        beam_size = 5
        samples, src_dict, tgt_dict = test_utils.prepare_inputs(test_args)
        task = tasks.DictionaryHolderTask(src_dict, tgt_dict)

        num_models = 3
        model_list = []
        for _ in range(num_models):
            model_list.append(task.build_model(test_args))
        encoder_ensemble = EncoderEnsemble(model_list)

        # test equivalence
        # The discrepancy in types here is a temporary expedient.
        # PyTorch indexing requires int64 while support for tracing
        # pack_padded_sequence() requires int32.
        sample = next(samples)
        src_tokens = sample["net_input"]["src_tokens"][0:1].t()
        src_lengths = sample["net_input"]["src_lengths"][0:1].int()

        pytorch_encoder_outputs = encoder_ensemble(src_tokens, src_lengths)

        decoder_step_ensemble = DecoderBatchedStepEnsemble(model_list,
                                                           tgt_dict,
                                                           beam_size=beam_size)

        tmp_dir = tempfile.mkdtemp()
        decoder_step_pb_path = os.path.join(tmp_dir, "decoder_step.pb")
        decoder_step_ensemble.onnx_export(decoder_step_pb_path,
                                          pytorch_encoder_outputs)

        # single EOS in flat array
        input_tokens = torch.LongTensor(np.array([tgt_dict.eos()]))
        prev_scores = torch.FloatTensor(np.array([0.0]))
        timestep = torch.LongTensor(np.array([0]))

        pytorch_first_step_outputs = decoder_step_ensemble(
            input_tokens, prev_scores, timestep, *pytorch_encoder_outputs)

        # next step inputs (input_tokens shape: [beam_size])
        next_input_tokens = torch.LongTensor(np.array([i
                                                       for i in range(4, 9)]))

        next_prev_scores = pytorch_first_step_outputs[1]
        next_timestep = timestep + 1
        next_states = list(pytorch_first_step_outputs[4:])

        # Tile these for the next timestep
        for i in range(len(model_list)):
            next_states[i] = next_states[i].repeat(1, beam_size, 1)

        pytorch_next_step_outputs = decoder_step_ensemble(
            next_input_tokens, next_prev_scores, next_timestep, *next_states)

        onnx_decoder = caffe2_backend.prepare_zip_archive(decoder_step_pb_path)

        if return_caffe2_rep:
            return onnx_decoder

        decoder_inputs_numpy = [
            next_input_tokens.numpy(),
            next_prev_scores.detach().numpy(),
            next_timestep.detach().numpy(),
        ]
        for tensor in next_states:
            decoder_inputs_numpy.append(tensor.detach().numpy())

        caffe2_next_step_outputs = onnx_decoder.run(
            tuple(decoder_inputs_numpy))

        for i in range(len(pytorch_next_step_outputs)):
            caffe2_out_value = caffe2_next_step_outputs[i]
            pytorch_out_value = pytorch_next_step_outputs[i].detach().numpy()
            np.testing.assert_allclose(caffe2_out_value,
                                       pytorch_out_value,
                                       rtol=1e-4,
                                       atol=1e-6)
        decoder_step_ensemble.save_to_db(
            output_path=os.path.join(tmp_dir, "decoder.predictor_export"),
            encoder_ensemble_outputs=pytorch_encoder_outputs,
        )
Example #21
    def test_collate(self):
        """
        Makes sure that we can memoize in collate if we give a particular data index
        in different orders.
        """
        test_args = test_utils.ModelParamsDict()
        _, src_dict, tgt_dict = test_utils.prepare_inputs(test_args)
        self.task = tasks.DictionaryHolderTask(src_dict, tgt_dict)

        teacher_model = pytorch_translate_utils.maybe_cuda(
            self.task.build_model(test_args)
        )

        d0, d1, d2, d3 = self._dummy_datasets(src_dict.eos(), tgt_dict.eos())
        dataset1 = [d0, d1]
        dataset2 = [d2, d3]
        dataset3 = [d3, d0]
        dataset4 = [d1, d2]

        top_k_teacher_scores = {}
        top_k_teacher_indices = {}
        b1 = TeacherDataset.collate(
            dataset1,
            [teacher_model],
            3,
            src_dict.pad(),
            src_dict.eos(),
            top_k_teacher_scores,
            top_k_teacher_indices,
        )
        TeacherDataset.collate(
            dataset2,
            [teacher_model],
            3,
            src_dict.pad(),
            src_dict.eos(),
            top_k_teacher_scores,
            top_k_teacher_indices,
        )
        before_scores = [top_k_teacher_scores[i].cpu().numpy() for i in range(4)]
        before_indices = [top_k_teacher_indices[i].cpu().numpy() for i in range(4)]

        TeacherDataset.collate(
            dataset3,
            [teacher_model],
            3,
            src_dict.pad(),
            src_dict.eos(),
            top_k_teacher_scores,
            top_k_teacher_indices,
        )
        TeacherDataset.collate(
            dataset4,
            [teacher_model],
            3,
            src_dict.pad(),
            src_dict.eos(),
            top_k_teacher_scores,
            top_k_teacher_indices,
        )
        after_scores = [top_k_teacher_scores[i].cpu().numpy() for i in range(4)]
        after_indices = [top_k_teacher_indices[i].cpu().numpy() for i in range(4)]

        for i in range(4):
            assert np.array_equal(after_scores[i], before_scores[i])
            assert np.array_equal(after_indices[i], before_indices[i])

        b5 = TeacherDataset.collate(
            dataset1,
            [teacher_model],
            3,
            src_dict.pad(),
            src_dict.eos(),
            top_k_teacher_scores,
            top_k_teacher_indices,
        )
        probs_before = b1["top_k_scores"].numpy()
        indices_before = b1["top_k_indices"].numpy()
        probs_after = b5["top_k_scores"].numpy()
        indices_after = b5["top_k_indices"].numpy()

        # The first sample has a different length, so its last four positions
        # in the "before" values are irrelevant padding; compare the overlap.
        assert np.array_equal(probs_before[0][:-4], probs_after[0][:-4])
        assert np.array_equal(indices_before[0][:-4], indices_after[0][:-4])
        assert np.array_equal(probs_after[0][-4:], np.zeros((4, 3)))
        assert np.array_equal(indices_after[0][-4:], np.zeros((4, 3)))

        assert np.array_equal(probs_before[1], probs_after[1])
        assert np.array_equal(indices_before[1], indices_after[1])
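Schematically, the memoization being tested caches per-index results on the first collate and reuses them later; an illustrative sketch, not the TeacherDataset API:

top_k_cache = {}

def collate_scores(index, compute_fn):
    # Fill the cache on first sight of an index, reuse it afterwards.
    if index not in top_k_cache:
        top_k_cache[index] = compute_fn(index)
    return top_k_cache[index]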
Example #22
    def _test_full_beam_decoder(self, test_args, quantize=False):
        samples, src_dict, tgt_dict = test_utils.prepare_inputs(test_args)
        task = tasks.DictionaryHolderTask(src_dict, tgt_dict)
        sample = next(samples)
        # [seq len, batch size=1]
        src_tokens = sample["net_input"]["src_tokens"][0:1].t()
        # [seq len]
        src_lengths = sample["net_input"]["src_lengths"][0:1].long()

        num_models = 3
        model_list = []
        for _ in range(num_models):
            model_list.append(task.build_model(test_args))

        length, word_length = 11, 7
        if test_args.arch in constants.ARCHS_FOR_CHAR_SOURCE:
            char_inds = torch.LongTensor(
                np.random.randint(0,
                                  126, (1, length, word_length),
                                  dtype="int64"))
            word_lengths = torch.IntTensor(
                np.array([word_length] * length, dtype="int32")).reshape(
                    (1, length))
        else:
            char_inds, word_lengths = None, None

        beam_size = 6
        bs = BeamSearch(
            model_list,
            tgt_dict,
            src_tokens,
            src_lengths,
            beam_size=beam_size,
            quantize=quantize,
            char_inds=char_inds,
            word_lengths=word_lengths,
        )
        f = io.BytesIO()
        bs.save_to_pytorch(f)

        # Test generalization with a different sequence length
        src_tokens = torch.LongTensor([1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                                       11]).unsqueeze(1)
        src_lengths = torch.LongTensor([11])
        prev_token = torch.LongTensor([0])
        prev_scores = torch.FloatTensor([0.0])
        attn_weights = torch.zeros(src_tokens.shape[0])
        prev_hypos_indices = torch.zeros(beam_size, dtype=torch.int64)

        outs = bs(
            src_tokens,
            src_lengths,
            prev_token,
            prev_scores,
            attn_weights,
            prev_hypos_indices,
            torch.LongTensor([20]),
            char_inds=char_inds,
            word_lengths=word_lengths,
        )

        f.seek(0)
        deserialized_bs = torch.jit.load(f)
        deserialized_bs.apply(lambda s: s._unpack()
                              if hasattr(s, "_unpack") else None)
        outs_deserialized = deserialized_bs(
            src_tokens,
            src_lengths,
            prev_token,
            prev_scores,
            attn_weights,
            prev_hypos_indices,
            torch.LongTensor([20]),
            char_inds=char_inds,
            word_lengths=word_lengths,
        )

        for a, b in zip(outs_deserialized, outs):
            np.testing.assert_allclose(a.detach().numpy(), b.detach().numpy())
Example #23
    def _test_full_beam_search_decoder(self, test_args, quantize=False):
        samples, src_dict, tgt_dict = test_utils.prepare_inputs(test_args)
        task = tasks.DictionaryHolderTask(src_dict, tgt_dict)
        sample = next(samples)
        # [seq len, batch size=1]
        src_tokens = sample["net_input"]["src_tokens"][0:1].t()
        # [seq len]
        src_lengths = sample["net_input"]["src_lengths"][0:1].long()

        num_models = 3
        model_list = []
        for _ in range(num_models):
            model_list.append(task.build_model(test_args))

        eos_token_id = 8
        length_penalty = 0.25
        nbest = 3
        stop_at_eos = True
        num_steps = torch.LongTensor([20])

        beam_size = 6
        bsd = BeamSearchAndDecode(
            model_list,
            tgt_dict,
            src_tokens,
            src_lengths,
            eos_token_id=eos_token_id,
            length_penalty=length_penalty,
            nbest=nbest,
            beam_size=beam_size,
            stop_at_eos=stop_at_eos,
            quantize=quantize,
        )
        f = io.BytesIO()
        bsd.save_to_pytorch(f)

        # Test generalization with a different sequence length
        src_tokens = torch.LongTensor([1, 2, 3, 4, 5, 6, 7, 9, 9, 10, 11]).unsqueeze(1)
        src_lengths = torch.LongTensor([11])
        prev_token = torch.LongTensor([0])
        prev_scores = torch.FloatTensor([0.0])
        attn_weights = torch.zeros(src_tokens.shape[0])
        prev_hypos_indices = torch.zeros(beam_size, dtype=torch.int64)

        outs = bsd(
            src_tokens,
            src_lengths,
            prev_token,
            prev_scores,
            attn_weights,
            prev_hypos_indices,
            num_steps[0],
        )

        f.seek(0)
        deserialized_bsd = torch.jit.load(f)
        deserialized_bsd.apply(lambda s: s._unpack() if hasattr(s, "_unpack") else None)
        outs_deserialized = deserialized_bsd(
            src_tokens,
            src_lengths,
            prev_token,
            prev_scores,
            attn_weights,
            prev_hypos_indices,
            num_steps[0],
        )

        for hypo, hypo_deserialized in zip(outs, outs_deserialized):
            np.testing.assert_array_equal(
                hypo[0].tolist(), hypo_deserialized[0].tolist()
            )
            np.testing.assert_array_almost_equal(
                hypo[2], hypo_deserialized[2], decimal=1
            )
            np.testing.assert_array_almost_equal(
                hypo[3].numpy(), hypo_deserialized[3].numpy(), decimal=1
            )
Example #24
    def test_decoder_ensemble_with_eos(self):
        """
        This is to test the functionality of DecoderBatchedStepEnsembleWithEOS class.
        We expect it generates same outputs with DecoderBatchedStepEnsemble before
        final step. At final step, it generates EOS tokens.
        """
        test_args = test_utils.ModelParamsDict(arch="rnn")
        samples, src_dict, tgt_dict = test_utils.prepare_inputs(test_args)
        task = tasks.DictionaryHolderTask(src_dict, tgt_dict)
        model = task.build_model(test_args)
        eos_token = tgt_dict.eos()

        encoder_ensemble = EncoderEnsemble([model])
        src_tokens = torch.LongTensor([4, 5, 6, 7, 8]).unsqueeze(1)
        src_lengths = torch.LongTensor([5])
        enc_inputs = (src_tokens, src_lengths)
        encoder_outputs = encoder_ensemble(*enc_inputs)

        beam_size = 8
        word_reward = 1
        unk_reward = -1
        decoder_ensemble = DecoderBatchedStepEnsemble(
            models=[model],
            tgt_dict=tgt_dict,
            beam_size=beam_size,
            word_reward=word_reward,
            unk_reward=unk_reward,
        )
        decoder_ensemble_with_eos = DecoderBatchedStepEnsembleWithEOS(
            models=[model],
            tgt_dict=tgt_dict,
            beam_size=beam_size,
            word_reward=word_reward,
            unk_reward=unk_reward,
        )

        prev_tokens = torch.LongTensor([eos_token])
        prev_scores = torch.FloatTensor([0.0])
        timestep = torch.LongTensor([0])
        final_step = torch.tensor([False], dtype=torch.bool)
        max_len = 5
        num_steps = torch.LongTensor([max_len])

        decoder_first_step_outputs = decoder_ensemble(prev_tokens, prev_scores,
                                                      timestep,
                                                      *encoder_outputs)

        decoder_with_eos_first_step_outputs = decoder_ensemble_with_eos(
            prev_tokens, prev_scores, timestep, final_step, *encoder_outputs)

        # Test results at first step
        self._test_base(decoder_first_step_outputs,
                        decoder_with_eos_first_step_outputs)

        (
            prev_tokens,
            prev_scores,
            prev_hypos_indices,
            attn_weights,
            *states,
        ) = decoder_first_step_outputs

        # Tile is needed after first step
        for i in range(len([model])):
            states[i] = states[i].repeat(1, beam_size, 1)

        (
            prev_tokens_with_eos,
            prev_scores_with_eos,
            prev_hypos_indices_with_eos,
            attn_weights_with_eos,
            *states_with_eos,
        ) = decoder_with_eos_first_step_outputs

        for i in range(len([model])):
            states_with_eos[i] = states_with_eos[i].repeat(1, beam_size, 1)

        for i in range(int(num_steps) - 1):
            decoder_step_outputs = decoder_ensemble(prev_tokens, prev_scores,
                                                    torch.tensor([i + 1]),
                                                    *states)
            (
                prev_tokens,
                prev_scores,
                prev_hypos_indices,
                attn_weights,
                *states,
            ) = decoder_step_outputs
            decoder_step_with_eos_outputs = decoder_ensemble_with_eos(
                prev_tokens_with_eos,
                prev_scores_with_eos,
                torch.tensor([i + 1]),
                final_step,
                *states_with_eos,
            )
            (
                prev_tokens_with_eos,
                prev_scores_with_eos,
                prev_hypos_indices_with_eos,
                attn_weights_with_eos,
                *states_with_eos,
            ) = decoder_step_with_eos_outputs

            # Test results at each step
            self._test_base(decoder_step_outputs,
                            decoder_step_with_eos_outputs)

        # Test the outputs of the final step
        decoder_final_with_eos_outputs = decoder_ensemble_with_eos(
            prev_tokens_with_eos,
            prev_scores_with_eos,
            num_steps,
            torch.tensor([True]),
            *states_with_eos,
        )

        np.testing.assert_array_equal(
            decoder_final_with_eos_outputs[0],
            torch.LongTensor([eos_token]).repeat(beam_size),
        )
        np.testing.assert_array_equal(
            decoder_final_with_eos_outputs[2],
            torch.LongTensor(np.array([i for i in range(beam_size)])),
        )
Example #25
    def test_beam_search_and_decode_generate(self):
        """
        A basic test that the output given by BeamSearchAndDecode class
        is the same as SequenceGenerator
        """
        test_args = test_utils.ModelParamsDict(arch="rnn")
        test_args.sequence_lstm = True
        BEAM_SIZE = 1
        WORD_REWARD = 1
        UNK_REWARD = -1
        LENGTH_PENALTY = 0

        PLACEHOLDER_SEQ_LENGTH = 5
        NBEST = 2
        MAX_SEQ_LEN = 7

        src_tokens = torch.LongTensor([[0, 0, 0]])
        src_lengths = torch.LongTensor([3])

        # Build model list
        samples, src_dict, tgt_dict = test_utils.prepare_inputs(test_args)
        task = tasks.DictionaryHolderTask(src_dict, tgt_dict)

        model = task.build_model(test_args)

        # Placeholder inputs for BeamSearchAndDecode
        placeholder_src_tokens = torch.LongTensor(
            np.ones((PLACEHOLDER_SEQ_LENGTH, 1), dtype="int64"))
        placeholder_src_lengths = torch.IntTensor(
            np.array([PLACEHOLDER_SEQ_LENGTH], dtype="int32"))
        prev_token = torch.LongTensor([tgt_dict.eos()])
        prev_scores = torch.FloatTensor([0.0])
        attn_weights = torch.zeros(src_lengths[0].item())
        prev_hypos_indices = torch.zeros(BEAM_SIZE, dtype=torch.int64)
        num_steps = torch.LongTensor([MAX_SEQ_LEN])

        # Generate output using SequenceGenerator
        translator = SequenceGenerator(
            [model],
            task.target_dictionary,
            beam_size=BEAM_SIZE,
            word_reward=WORD_REWARD,
            unk_reward=UNK_REWARD,
        )

        encoder_input = {"src_tokens": src_tokens, "src_lengths": src_lengths}
        top_seq_gen_hypothesis = translator.generate(encoder_input,
                                                     beam_size=BEAM_SIZE,
                                                     maxlen=MAX_SEQ_LEN)[0]

        # Generate output using BeamSearchAndDecode class
        beam_search_and_decode = BeamSearchAndDecode(
            [model],
            tgt_dict=tgt_dict,
            src_tokens=placeholder_src_tokens,
            src_lengths=placeholder_src_lengths,
            eos_token_id=tgt_dict.eos(),
            length_penalty=LENGTH_PENALTY,
            nbest=NBEST,
            beam_size=BEAM_SIZE,
            stop_at_eos=True,
            word_reward=WORD_REWARD,
            unk_reward=UNK_REWARD,
            quantize=True,
        )
        beam_search_and_decode_output = beam_search_and_decode(
            src_tokens.transpose(0, 1),
            src_lengths,
            prev_token,
            prev_scores,
            attn_weights,
            prev_hypos_indices,
            num_steps[0],
        )

        for hyp_index in range(
                min(len(beam_search_and_decode_output),
                    len(top_seq_gen_hypothesis))):
            beam_search_and_decode_hypothesis = beam_search_and_decode_output[
                hyp_index]

            # Compare the two outputs only over positions 0..MAX_SEQ_LEN,
            # because the sequence generator appends an EOS after MAX_SEQ_LEN.

            # Compare two hypotheses
            np.testing.assert_array_equal(
                top_seq_gen_hypothesis[hyp_index]["tokens"].tolist()
                [0:MAX_SEQ_LEN],
                beam_search_and_decode_hypothesis[0].tolist()[0:MAX_SEQ_LEN],
            )
            # Compare token level scores
            np.testing.assert_array_almost_equal(
                top_seq_gen_hypothesis[hyp_index]
                ["positional_scores"].tolist()[0:MAX_SEQ_LEN],
                beam_search_and_decode_hypothesis[2][0:MAX_SEQ_LEN],
                decimal=1,
            )

            # Compare attention weights
            np.testing.assert_array_almost_equal(
                top_seq_gen_hypothesis[hyp_index]["attention"].numpy()
                [:, 0:MAX_SEQ_LEN],
                beam_search_and_decode_hypothesis[3].numpy()[:, 0:MAX_SEQ_LEN],
                decimal=1,
            )
Example #26
    def test_forward_training(self):
        """
        We test that if we shuffle the input sample, we will get the same
        forward values, both in training mode (without dropout) and in
        eval mode.
        For the meanwhile, we use an auxiliary hybrid_transformer_rnn
        in order to get the encoder output.
        """
        test_word_decoder_args = test_utils.ModelParamsDict(
            arch="hybrid_transformer_rnn")
        self.task = tasks.DictionaryHolderTask(self.word_dict, self.word_dict)
        word_model = maybe_cuda(self.task.build_model(test_word_decoder_args))
        word_model.eval()  # Make sure we do not apply dropout.

        test_args = test_utils.ModelParamsDict(arch="char_aware_hybrid")

        decoder_embed_tokens = maybe_cuda(
            transformer.build_embedding(dictionary=self.word_dict,
                                        embed_dim=10))
        decoder = maybe_cuda(
            char_aware_hybrid.CharAwareHybridRNNDecoder(
                args=test_args,
                src_dict=self.word_dict,
                dst_dict=self.word_dict,
                embed_tokens=decoder_embed_tokens,
                num_chars=len(self.char_dict),
            ))

        src_tokens = maybe_cuda(self.sample["net_input"]["src_tokens"])
        src_lengths = maybe_cuda(self.sample["net_input"]["src_lengths"])
        prev_output_chars = maybe_cuda(
            self.sample["net_input"]["prev_output_chars"][:,
                                                          -1:, :].squeeze(1))
        prev_output_tokens = maybe_cuda(
            self.sample["net_input"]["prev_output_tokens"][:, 0:1])

        encoder_out = word_model.encoder(src_tokens, src_lengths)

        embed_output = decoder._embed_prev_outputs(
            prev_output_tokens=prev_output_tokens,
            prev_output_chars=prev_output_chars)[0]
        forward_output = decoder(
            prev_output_tokens=prev_output_tokens,
            encoder_out=encoder_out,
            prev_output_chars=prev_output_chars,
        )
        output_logits = forward_output[0]

        prev_output_tokens_shuffled = torch.cat(
            [prev_output_tokens[1:], prev_output_tokens[0].unsqueeze(0)],
            dim=0)
        prev_output_chars_shuffled = torch.cat(
            [prev_output_chars[1:], prev_output_chars[0].unsqueeze(0)], dim=0)
        src_tokens_shuffled = torch.cat(
            [src_tokens[1:], src_tokens[0].unsqueeze(0)], dim=0)

        # Making sure shuffling is done correctly.
        assert torch.equal(src_tokens[0], src_tokens_shuffled[2])
        assert torch.equal(src_tokens[1], src_tokens_shuffled[0])
        assert torch.equal(src_tokens[2], src_tokens_shuffled[1])
        assert torch.equal(prev_output_chars[0], prev_output_chars_shuffled[2])
        assert torch.equal(prev_output_chars[1], prev_output_chars_shuffled[0])
        assert torch.equal(prev_output_chars[2], prev_output_chars_shuffled[1])
        assert torch.equal(prev_output_tokens[0],
                           prev_output_tokens_shuffled[2])
        assert torch.equal(prev_output_tokens[1],
                           prev_output_tokens_shuffled[0])
        assert torch.equal(prev_output_tokens[2],
                           prev_output_tokens_shuffled[1])

        # Making sure that we embed the inputs correctly.
        encoder_out_shuffled = word_model.encoder(src_tokens_shuffled,
                                                  src_lengths)
        embed_output_shuffled = decoder._embed_prev_outputs(
            prev_output_tokens=prev_output_tokens_shuffled,
            prev_output_chars=prev_output_chars_shuffled,
        )[0]
        assert embed_output[0, 0].equal(embed_output_shuffled[0, 2])
        assert embed_output[0, 1].equal(embed_output_shuffled[0, 0])
        assert embed_output[0, 2].equal(embed_output_shuffled[0, 1])

        # Making sure the output of the forward function is correct.
        forward_output_shuffled = decoder(
            prev_output_tokens=prev_output_tokens_shuffled,
            encoder_out=encoder_out_shuffled,
            prev_output_chars=prev_output_chars_shuffled,
        )
        output_logits_shuffled = forward_output_shuffled[0]

        assert encoder_out[0][:, 0, :].equal(encoder_out_shuffled[0][:, 2, :])
        assert encoder_out[0][:, 1, :].equal(encoder_out_shuffled[0][:, 0, :])
        assert encoder_out[0][:, 2, :].equal(encoder_out_shuffled[0][:, 1, :])

        assert output_logits[0].equal(output_logits_shuffled[2])
        assert output_logits[1].equal(output_logits_shuffled[0])
        assert output_logits[2].equal(output_logits_shuffled[1])
        """
        Now trying in the eval mode.
        """
        decoder.eval()
        forward_output = decoder(
            prev_output_tokens=prev_output_tokens,
            encoder_out=encoder_out,
            prev_output_chars=prev_output_chars,
        )
        output_logits = forward_output[0]
        forward_output_shuffled = decoder(
            prev_output_tokens=prev_output_tokens_shuffled,
            encoder_out=encoder_out_shuffled,
            prev_output_chars=prev_output_chars_shuffled,
        )
        output_logits_shuffled = forward_output_shuffled[0]
        assert output_logits[0].equal(output_logits_shuffled[2])
        assert output_logits[1].equal(output_logits_shuffled[0])
        assert output_logits[2].equal(output_logits_shuffled[1])
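The property this test exercises, in miniature: any per-sample (batch-independent) module must commute with a permutation of the batch.

import torch

x = torch.randn(3, 4)
perm = torch.tensor([1, 2, 0])          # same rotation as the shuffles above
module = torch.nn.Linear(4, 4).eval()
assert torch.allclose(module(x)[perm], module(x[perm]))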