# Example #1
 def test_load_data_single_path(self):
     """Loading one binarized src/tgt path pair yields a LanguagePairDataset."""
     test_args = test_utils.ModelParamsDict()
     test_args.source_lang = "en"
     test_args.target_lang = "fr"
     test_args.log_verbose = False
     src_dict, tgt_dict = test_utils.create_vocab_dictionaries()
     src_text_file, tgt_text_file = test_utils.create_test_text_files()

     def _binarize(text_file, dictionary):
         # Binarize one side of the corpus into a fresh temporary file.
         return preprocess.binarize_text_file(
             text_file=text_file,
             dictionary=dictionary,
             output_path=tempfile.NamedTemporaryFile().name,
             append_eos=True,
             reverse_order=False,
         )

     src_bin_path = _binarize(src_text_file, src_dict)
     tgt_bin_path = _binarize(tgt_text_file, tgt_dict)
     task = tasks.PytorchTranslateTask(test_args, src_dict, tgt_dict)
     split = "0"
     task.load_dataset(split, src_bin_path, tgt_bin_path)
     # The test corpus contains 4 sentence pairs.
     self.assertEqual(len(task.datasets[split]), 4)
     self.assertIsInstance(task.datasets[split], LanguagePairDataset)
    def test_combine_weighted_scores(self):
        """Rescorer.combine_weighted_scores should scale each model score in place.

        Weights: original 1, r2l 0, reverse 0.5 and lm 0.75 — the latter two
        are additionally normalized by the source length (5).
        """
        test_args = test_utils.ModelParamsDict()
        test_args.enable_rescoring = True
        test_args.original_model_weight = 1
        test_args.l2r_model_path = ""
        test_args.l2r_model_weight = 1
        test_args.r2l_model_weight = 0
        test_args.reverse_model_weight = 0.5
        test_args.lm_model_weight = 0.75
        # Fixed: the original assigned length_penalty twice with the same value.
        test_args.length_penalty = 1

        _, src_dict, tgt_dict = test_utils.prepare_inputs(test_args)
        task = tasks.PytorchTranslateTask(test_args, src_dict, tgt_dict)
        model = task.build_model(test_args)
        with patch(
            "pytorch_translate.utils.load_diverse_ensemble_for_inference",
            return_value=([model], test_args, task),
        ):
            rescorer = Rescorer(test_args)

            scores = torch.tensor([[10, 20, 30, 40]], dtype=torch.float)
            src_tokens = torch.tensor([1, 2, 3, 4, 5])
            hypos = [{"tokens": torch.tensor([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])}]
            rescorer.combine_weighted_scores(scores, src_tokens, hypos)

            # 10*1=10. , 20*0=0. , 30*(0.5/5)=3. , 40*(0.75/5)=6.
            expected = torch.tensor([[10.0, 0.0, 3.0, 6.0]], dtype=torch.float)
            assert torch.equal(scores, expected)
# Example #3
 def test_load_data_multi_path(self):
     """Loading several binarized corpora yields a MultiCorpusSampledDataset."""
     test_args = test_utils.ModelParamsDict()
     test_args.source_lang = "en"
     test_args.target_lang = "fr"
     test_args.log_verbose = False
     src_dict, tgt_dict = test_utils.create_vocab_dictionaries()
     num_paths = 4
     src_bin_path = {}
     tgt_bin_path = {}
     for corpus_id in range(num_paths):
         src_text_file, tgt_text_file = test_utils.create_test_text_files()
         # Binarize both sides of this corpus into temporary files.
         for path_map, text_file, dictionary in (
             (src_bin_path, src_text_file, src_dict),
             (tgt_bin_path, tgt_text_file, tgt_dict),
         ):
             path_map[corpus_id] = preprocess.binarize_text_file(
                 text_file=text_file,
                 dictionary=dictionary,
                 output_path=tempfile.NamedTemporaryFile().name,
                 append_eos=True,
                 reverse_order=False,
             )
     task = tasks.PytorchTranslateTask(test_args, src_dict, tgt_dict)
     split = "1"
     task.load_dataset(split, src_bin_path, tgt_bin_path)
     # 4 corpora x 4 sentence pairs each.
     self.assertEqual(len(task.datasets[split]), 16)
     self.assertIsInstance(task.datasets[split], MultiCorpusSampledDataset)
# Example #4
    def test_compute_scores(self):
        """compute_scores should sum per-token log-probs per hypothesis."""
        # TODO(halilakin): Verify behaviour in batch mode
        test_args = test_utils.ModelParamsDict()
        _, src_dict, tgt_dict = test_utils.prepare_inputs(test_args)
        task = tasks.PytorchTranslateTask(test_args, src_dict, tgt_dict)
        model = task.build_model(test_args)

        with patch(
            "pytorch_translate.utils.load_diverse_ensemble_for_inference",
            return_value=([model], test_args, task),
        ):
            scorer = SimpleModelScorer(test_args, "/tmp/model_path.txt")
            tgt_tokens = torch.tensor([[2, 11, 22, 0], [2, 33, 44, 55]])
            num_hypos, hypo_len = tgt_tokens.shape
            logprobs = torch.zeros(num_hypos, hypo_len, len(tgt_dict))
            # Hypothesis 0: 0.5 + 1.5 = 2.0 (trailing position is padded).
            for position, token, value in ((0, 11, 0.5), (1, 22, 1.5)):
                logprobs[0, position, token] = value
            logprobs[0, 3, :] = 5
            # Hypothesis 1: 0.5 + 1.5 + 2.5 = 4.5.
            for position, token, value in ((0, 33, 0.5), (1, 44, 1.5), (2, 55, 2.5)):
                logprobs[1, position, token] = value

            hypos_scores = scorer.compute_scores(tgt_tokens, logprobs)
            assert hypos_scores[0] == 2.0
            assert hypos_scores[1] == 4.5
# Example #5
 def test_load_data_multi_path(self):
     """Multi-path loading via the shared helper yields a ConcatDataset."""
     num_paths = 4
     (
         test_args,
         src_dict,
         tgt_dict,
         src_bin_path,
         tgt_bin_path,
     ) = self._prepare_data_multi_path(num_paths)
     task = tasks.PytorchTranslateTask(test_args, src_dict, tgt_dict)
     split = "1"
     task.load_dataset(split, src_bin_path, tgt_bin_path)
     # 4 corpora x 4 sentence pairs each.
     self.assertEqual(len(task.datasets[split]), 16)
     self.assertIsInstance(task.datasets[split], ConcatDataset)
# Example #6
 def test_load_data_noising(self):
     """A word-dropout map should wrap the source side in a NoisingDataset."""
     num_paths = 4
     (
         test_args,
         src_dict,
         tgt_dict,
         src_bin_path,
         tgt_bin_path,
     ) = self._prepare_data_multi_path(num_paths)
     # Apply word dropout only to corpus 0 of the en-fr pair.
     test_args.word_dropout_prob_map = str({"en-fr": {0: 0.1}})
     task = tasks.PytorchTranslateTask(test_args, src_dict, tgt_dict)
     split = "1"
     task.load_dataset(split, src_bin_path, tgt_bin_path)
     self.assertEqual(len(task.datasets[split]), 16)
     self.assertIsInstance(task.datasets[split].datasets[0].src, NoisingDataset)
# Example #7
 def test_reverse_tgt_tokens(self):
     """reverse_tgt_tokens should flip each row while keeping padding at the end."""
     test_args = test_utils.ModelParamsDict()
     _, src_dict, tgt_dict = test_utils.prepare_inputs(test_args)
     task = tasks.PytorchTranslateTask(test_args, src_dict, tgt_dict)
     model = task.build_model(test_args)
     with patch(
         "pytorch_translate.utils.load_diverse_ensemble_for_inference",
         return_value=([model], test_args, task),
     ):
         scorer = R2LModelScorer(test_args, "/tmp/model_path.txt")
         pad = task.tgt_dict.pad()
         # Rows of decreasing real length; padding must stay on the right.
         tgt_tokens = torch.Tensor([[1, 2, 3], [1, 2, pad], [1, pad, pad]])
         expected_tokens = torch.Tensor(
             [[3, 2, 1], [2, 1, pad], [1, pad, pad]]
         )
         reversed_tgt_tokens = scorer.reverse_tgt_tokens(tgt_tokens)
         assert torch.equal(reversed_tgt_tokens, expected_tokens)
# Example #8
 def test_load_data_single_path_idx_bin(self):
     """Loading mmap-binarized (idx/bin) data yields a LanguagePairDataset."""
     test_args = test_utils.ModelParamsDict()
     test_args.source_lang = "en"
     test_args.target_lang = "fr"
     test_args.log_verbose = False
     src_dict, tgt_dict = test_utils.create_vocab_dictionaries()
     src_text_file, tgt_text_file = test_utils.create_test_text_files()
     task = tasks.PytorchTranslateTask(test_args, src_dict, tgt_dict)
     with tempfile.TemporaryDirectory() as destdir:
         preproc_parser = preprocess_options.get_preprocessing_parser()
         preproc_args = preproc_parser.parse_args(
             [
                 "--source-lang",
                 test_args.source_lang,
                 "--target-lang",
                 test_args.target_lang,
                 "--destdir",
                 destdir,
             ]
         )
         preproc_args.dataset_impl = "mmap"
         split = "train"
         # Binarize both sides of the corpus into the temporary destdir.
         for text_file, dictionary, lang in (
             (src_text_file, src_dict, test_args.source_lang),
             (tgt_text_file, tgt_dict, test_args.target_lang),
         ):
             binarize(
                 preproc_args,
                 text_file,
                 dictionary,
                 split,
                 lang,
                 offset=0,
                 end=-1,
             )
         src_path = dataset_dest_prefix(
             preproc_args, split, test_args.source_lang
         )
         tgt_path = dataset_dest_prefix(
             preproc_args, split, test_args.target_lang
         )
         task.load_dataset(split, src_path, tgt_path, is_npz=False)
         self.assertEqual(len(task.datasets[split]), 4)
         self.assertIsInstance(task.datasets[split], LanguagePairDataset)
# Example #9
    def test_combine_weighted_scores(self):
        """combine_weighted_scores (free function) should weight and length-normalize.

        Each hypothesis score is divided by tgt_len ** length_penalty and
        scaled by its model's weight before being combined.
        """
        test_args = test_utils.ModelParamsDict()
        test_args.enable_rescoring = True
        test_args.l2r_model_path = ""
        test_args.l2r_model_weight = 1.0
        test_args.r2l_model_weight = 0.0
        test_args.reverse_model_weight = 0.0
        test_args.cloze_transformer_weight = 0.0
        test_args.lm_model_weight = 1.01
        # Fixed: the original assigned length_penalty twice (1, then 1.0).
        test_args.length_penalty = 1.0

        _, src_dict, tgt_dict = test_utils.prepare_inputs(test_args)
        task = tasks.PytorchTranslateTask(test_args, src_dict, tgt_dict)
        model = task.build_model(test_args)
        with patch(
            "pytorch_translate.utils.load_diverse_ensemble_for_inference",
            return_value=([model], test_args, task),
        ):
            # One row per hypothesis, one column per model score.
            scores = torch.tensor(
                [[80, 0, 0, 0, 0], [0, 0, 0, 80, 0]], dtype=torch.float
            )
            src_tokens = torch.tensor([1, 2, 3, 4, 5])
            hypos = [
                {"tokens": torch.tensor([1, 2])},
                {"tokens": torch.tensor([1, 2])},
            ]

            src_len = len(src_tokens)
            tgt_len = torch.tensor(
                [len(hypo["tokens"]) for hypo in hypos], dtype=torch.float
            )
            weights = [
                test_args.l2r_model_weight,
                test_args.r2l_model_weight,
                test_args.reverse_model_weight,
                test_args.lm_model_weight,
                test_args.cloze_transformer_weight,
            ]
            combined_scores = combine_weighted_scores(
                scores, weights, src_len, tgt_len, 1
            )

            # 80*1.0/(2^1)=40.0 and 80*1.01/(2^1)=40.4
            expected = torch.tensor([40.0, 40.4], dtype=torch.float)
            assert torch.equal(combined_scores, expected)
# Example #10
    def test_reverse_scorer_prepare_inputs(self):
        """prepare_inputs should swap roles: hypos feed the encoder, source becomes target."""
        test_args = test_utils.ModelParamsDict()
        test_args.append_eos_to_source = True
        _, src_dict, tgt_dict = test_utils.prepare_inputs(test_args)
        task = tasks.PytorchTranslateTask(test_args, src_dict, tgt_dict)
        model = task.build_model(test_args)

        pad = task.tgt_dict.pad()
        eos = task.tgt_dict.eos()

        with patch(
            "pytorch_translate.utils.load_diverse_ensemble_for_inference",
            return_value=([model], test_args, task),
        ):
            scorer = ReverseModelScorer(
                test_args, "/tmp/model_path.txt", None, task
            )
            src_tokens = torch.tensor([6, 7, 8], dtype=torch.int)
            hypos = [
                {"tokens": torch.tensor([12, 13, 14, eos], dtype=torch.int)},
                {"tokens": torch.tensor([22, 23, eos], dtype=torch.int)},
            ]

            encoder_inputs, tgt_tokens = scorer.prepare_inputs(src_tokens, hypos)

            # The hypotheses, right-padded to equal length, become the encoder input.
            expected_encoder_tokens = torch.tensor(
                [[12, 13, 14, eos], [22, 23, eos, pad]], dtype=torch.int
            )
            assert torch.equal(
                encoder_inputs[0], expected_encoder_tokens
            ), "Encoder inputs are not as expected"
            max_tgt_len = max(len(hypo["tokens"]) for hypo in hypos)
            assert (
                encoder_inputs[1][0] == max_tgt_len
            ), " Src length is not as expected"

            # The original source, wrapped in EOS, is repeated as the target.
            expected_tgt_tokens = torch.tensor(
                [[eos, 6, 7, 8, eos], [eos, 6, 7, 8, eos]], dtype=torch.int
            )
            assert torch.equal(
                tgt_tokens, expected_tgt_tokens
            ), "Target tokens are not as expected"
# Example #11
 def test_load_data_noising(self):
     """An explicit noiser should wrap the source side in a NoisingDataset."""
     test_args = test_utils.ModelParamsDict()
     test_args.source_lang = "en"
     test_args.target_lang = "fr"
     test_args.log_verbose = False
     src_dict, tgt_dict = test_utils.create_vocab_dictionaries()
     num_paths = 4
     src_bin_path = {}
     tgt_bin_path = {}
     for corpus_id in range(num_paths):
         src_text_file, tgt_text_file = test_utils.create_test_text_files()
         # Binarize both sides of this corpus into temporary files.
         for path_map, text_file, dictionary in (
             (src_bin_path, src_text_file, src_dict),
             (tgt_bin_path, tgt_text_file, tgt_dict),
         ):
             path_map[corpus_id] = preprocess.binarize_text_file(
                 text_file=text_file,
                 dictionary=dictionary,
                 output_path=tempfile.NamedTemporaryFile().name,
                 append_eos=True,
                 reverse_order=False,
             )
     task = tasks.PytorchTranslateTask(test_args, src_dict, tgt_dict)
     split = "1"
     noiser = UnsupervisedMTNoising(
         dictionary=src_dict,
         max_word_shuffle_distance=3,
         word_dropout_prob=0.2,
         word_blanking_prob=0.2,
     )
     # Noise only corpus 0.
     task.load_dataset(split, src_bin_path, tgt_bin_path, noiser={0: noiser})
     self.assertEqual(len(task.datasets[split]), 16)
     self.assertIsInstance(task.datasets[split].datasets[0].src, NoisingDataset)
    def test_convert_hypos_to_tgt_tokens(self):
        """convert_hypos_to_tgt_tokens should prepend EOS and right-pad to max length."""
        test_args = test_utils.ModelParamsDict()
        _, src_dict, tgt_dict = test_utils.prepare_inputs(test_args)
        task = tasks.PytorchTranslateTask(test_args, src_dict, tgt_dict)
        model = task.build_model(test_args)

        with patch(
            "pytorch_translate.utils.load_diverse_ensemble_for_inference",
            return_value=([model], test_args, task),
        ):
            scorer = SimpleModelScorer(test_args, None)

            # Hypotheses [1..5], [1..4], [1..3], [1, 2], [1] — strictly
            # decreasing length so padding amounts differ per row.
            hypos = [
                {"tokens": torch.Tensor(list(range(1, length + 1)))}
                for length in range(5, 0, -1)
            ]
            tgt_tokens = scorer.convert_hypos_to_tgt_tokens(hypos)

            pad = task.tgt_dict.pad()
            eos = task.tgt_dict.eos()
            # Each row: EOS prefix, the original tokens, then padding out to
            # the longest hypothesis length (5) + 1.
            expected_rows = [
                [eos] + list(range(1, length + 1)) + [pad] * (5 - length)
                for length in range(5, 0, -1)
            ]
            expected_tgt_tokens = torch.Tensor(expected_rows).type_as(tgt_tokens)
            assert torch.equal(tgt_tokens, expected_tgt_tokens)
    def test_model_passing_as_parameter(self):
        """Rescorer should accept pre-built models directly instead of loading from disk."""
        test_args = test_utils.ModelParamsDict("transformer")
        test_args.enable_rescoring = True
        test_args.l2r_model_weight = 1.0
        test_args.r2l_model_weight = 0.0
        test_args.reverse_model_weight = 0.0
        test_args.lm_model_weight = 1.01
        test_args.cloze_transformer_weight = 1.0
        # Fixed: the original assigned length_penalty twice (1, then 1.0).
        test_args.length_penalty = 1.0

        _, src_dict, tgt_dict = test_utils.prepare_inputs(test_args)
        task = tasks.PytorchTranslateTask(test_args, src_dict, tgt_dict)
        model = task.build_model(test_args)
        src_tokens = torch.tensor([[1, 2, 3, 4, 5]])
        hypos = [{"tokens": torch.tensor([1, 2])}, {"tokens": torch.tensor([1, 2])}]
        # Pass the already-built model in the models dict: no ensemble load.
        rescorer = Rescorer(
            test_args, task, {"l2r_model": {"model": model, "task": task}}
        )
        scores = rescorer.score(src_tokens, hypos)
        # One score column per rescoring model.
        assert scores.size()[1] == 5
    def test_batch_computation(self):
        """Batched rescoring must produce the same scores as a smaller batch."""
        test_args = test_utils.ModelParamsDict("transformer")
        test_args.enable_rescoring = True
        test_args.l2r_model_path = "/tmp/test_rescorer_model.pt"
        test_args.l2r_model_weight = 1.0
        test_args.r2l_model_weight = 0.0
        test_args.reverse_model_weight = 0.0
        test_args.cloze_transformer_weight = 1.0
        test_args.lm_model_weight = 0.0
        # Fixed: the original assigned length_penalty twice (1, then 1.0).
        test_args.length_penalty = 1.0

        _, src_dict, tgt_dict = test_utils.prepare_inputs(test_args)
        task = tasks.PytorchTranslateTask(test_args, src_dict, tgt_dict)
        model = task.build_model(test_args)
        torch.save(model, test_args.l2r_model_path)
        with patch(
            "pytorch_translate.utils.load_diverse_ensemble_for_inference",
            return_value=([model], test_args, task),
        ):
            rescorer = Rescorer(test_args)
            # Two source sentences, two hypotheses each.
            src_tokens = torch.tensor([[1, 3, 3, 4, 2], [1, 3, 2, 0, 0]])
            hypos = [
                {"tokens": torch.tensor([1, 5, 2])},
                {"tokens": torch.tensor([6, 3, 5, 2])},
                {"tokens": torch.tensor([1, 2])},
                {"tokens": torch.tensor([1, 5, 6, 2])},
            ]
            scores = rescorer.score(src_tokens, hypos)

            # Rescore only the first sentence's hypotheses on their own.
            src_tokens = torch.tensor([[1, 3, 3, 4, 2]])
            hypos = [
                {"tokens": torch.tensor([1, 5, 2])},
                {"tokens": torch.tensor([6, 3, 5, 2])},
            ]
            scores_single = rescorer.score(src_tokens, hypos)

            # Batching must not change per-hypothesis scores.
            assert torch.equal(scores[0], scores_single[0])
# Example #15
 def setUp(self):
     """Build the default model/task pair shared by the tests in this case."""
     self.args = test_utils.ModelParamsDict()
     _, source_dict, target_dict = test_utils.prepare_inputs(self.args)
     self.task = tasks.PytorchTranslateTask(self.args, source_dict, target_dict)
     self.model = self.task.build_model(self.args)