    def setUp(self):
        self.tgt_dict, self.w1, self.w2, self.src_tokens, self.src_lengths, self.model = (
            test_utils.sequence_generator_setup())

        dummy_src_samples = self.src_tokens

        self.tgt_dataset = test_utils.TestDataset(data=dummy_src_samples)
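
For reference, the TestDataset helper used throughout these examples is a thin test utility; a minimal sketch, assuming it does nothing more than serve a list of pre-built tensors by index (the real fairseq tests/utils.py version may differ):

import torch

class TestDataset(torch.utils.data.Dataset):
    # Minimal stand-in: returns pre-tokenized tensors by index.
    def __init__(self, data):
        super().__init__()
        self.data = data
        self.sizes = None

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return len(self.data)
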
    def setUp(self):
        (
            self.tgt_dict,
            self.w1,
            self.w2,
            self.src_tokens,
            self.src_lengths,
            self.model,
        ) = test_utils.sequence_generator_setup()

        dummy_src_samples = self.src_tokens

        self.tgt_dataset = test_utils.TestDataset(data=dummy_src_samples)
        self.cuda = torch.cuda.is_available()
    def setUp(self):
        self.tgt_dict, self.w1, self.w2, self.src_tokens, self.src_lengths, self.model = (
            test_utils.sequence_generator_setup())
        backtranslation_args = argparse.Namespace()
        # Same as defaults from fairseq/options.py
        backtranslation_args.backtranslation_unkpen = 0
        backtranslation_args.backtranslation_sampling = False
        backtranslation_args.backtranslation_max_len_a = 0
        backtranslation_args.backtranslation_max_len_b = 200
        backtranslation_args.backtranslation_beam = 2

        self.backtranslation_args = backtranslation_args

        dummy_src_samples = self.src_tokens

        self.tgt_dataset = test_utils.TestDataset(data=dummy_src_samples)
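
For context, a hedged sketch of how these arguments would typically be consumed; build_backtranslation_generator is an invented helper name, and the real test may wire the generator up differently:

from fairseq.sequence_generator import SequenceGenerator

def build_backtranslation_generator(self):
    # Sketch only (not part of the original test): maps the args
    # prepared in setUp() onto SequenceGenerator's parameters.
    args = self.backtranslation_args
    return SequenceGenerator(
        models=[self.model],
        tgt_dict=self.tgt_dict,
        beam_size=args.backtranslation_beam,
        max_len_a=args.backtranslation_max_len_a,
        max_len_b=args.backtranslation_max_len_b,
        unk_penalty=args.backtranslation_unkpen,
    )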
Example no. 4
    def _get_noising_dataset_batch(
        self,
        src_tokens_no_pad,
        src_dict,
        append_eos_to_tgt=False,
    ):
        """
        Constructs a NoisingDataset and the corresponding
        ``LanguagePairDataset(NoisingDataset(src), src)``. If
        *append_eos_to_tgt* is True, wrap the source dataset in
        :class:`TransformEosDataset` to append EOS to the clean source when
        using it as the target.
        """
        src_dataset = test_utils.TestDataset(data=src_tokens_no_pad)

        noising_dataset = noising.NoisingDataset(
            src_dataset=src_dataset,
            src_dict=src_dict,
            seed=1234,
            max_word_shuffle_distance=3,
            word_dropout_prob=0.2,
            word_blanking_prob=0.2,
            noising_class=noising.UnsupervisedMTNoising,
        )
        tgt = src_dataset
        language_pair_dataset = LanguagePairDataset(
            src=noising_dataset,
            tgt=tgt,
            src_sizes=None,
            src_dict=src_dict,
        )
        language_pair_dataset = TransformEosDataset(
            language_pair_dataset,
            src_dict.eos(),
            append_eos_to_tgt=append_eos_to_tgt,
        )

        dataloader = torch.utils.data.DataLoader(
            dataset=language_pair_dataset,
            batch_size=2,
            collate_fn=language_pair_dataset.collater,
        )
        denoising_batch_result = next(iter(dataloader))
        return denoising_batch_result
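
A hedged usage sketch of the helper above; the fixture names are placeholders, and the batch keys follow LanguagePairDataset.collater's standard layout:

# Illustration only: src_tokens_no_pad and src_dict come from the
# surrounding test fixture.
batch = self._get_noising_dataset_batch(
    src_tokens_no_pad, src_dict, append_eos_to_tgt=True
)
assert {"id", "ntokens", "net_input", "target"} <= set(batch)
src_tokens = batch["net_input"]["src_tokens"]  # noised source, padded
target = batch["target"]                       # clean source used as target
assert src_tokens.size(0) == target.size(0) == 2  # batch_size=2 above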
Example no. 5
    def test_eval_dataloader(self):
        dictionary = test_utils.dummy_dictionary(10)
        assert len(dictionary) == 14  # 4 extra special symbols
        assert dictionary.pad() == 1

        dataset = test_utils.TestDataset([
            torch.tensor([4, 5, 6, 7], dtype=torch.long),
            torch.tensor([8, 9, 10, 11], dtype=torch.long),
            torch.tensor([12, 13], dtype=torch.long),
        ])
        dataset = MonolingualDataset(dataset,
                                     sizes=[4, 4, 2],
                                     src_vocab=dictionary)

        config = LanguageModelingConfig(tokens_per_sample=4)
        task = LanguageModelingTask(config, dictionary)

        eval_dataloader = task.eval_lm_dataloader(
            dataset=dataset,
            batch_size=1,
            context_window=2,
            num_workers=0,
        )

        # First window: the whole first sentence [4, 5, 6, 7]; there is no
        # previous sample yet, so the remaining positions are just padding.
        batch = next(eval_dataloader)
        assert batch["net_input"]["src_tokens"][0].tolist() == [4, 5, 6, 7, 1, 1]
        assert batch["target"][0].tolist() == [4, 5, 6, 7, 1, 1]

        # Second window: the last context_window=2 tokens of the previous
        # sample ([6, 7]) are prepended as context, and the target masks
        # them with pad so they are not scored again.
        batch = next(eval_dataloader)
        assert batch["net_input"]["src_tokens"][0].tolist() == [6, 7, 8, 9, 10, 11]
        assert batch["target"][0].tolist() == [1, 1, 8, 9, 10, 11]

        # Third window: context [10, 11] followed by the final, shorter sample.
        batch = next(eval_dataloader)
        assert batch["net_input"]["src_tokens"][0].tolist() == [10, 11, 12, 13]
        assert batch["target"][0].tolist() == [1, 1, 12, 13]
Example no. 6
    def _get_noising_dataset_batch(self,
                                   src_tokens_no_pad,
                                   src_dict,
                                   use_append_eos_dataset=False):
        """
        Constructs a NoisingDataset and the corresponding
        LanguagePairDataset(NoisingDataset(src), src). If we set
        use_append_eos_dataset to True, wrap the source dataset in
        AppendEosDataset to append EOS to the clean source when using it as the
        target. In practice, we should use AppendEosDataset because our models
        usually have source without EOS but target with EOS.
        """
        src_dataset = test_utils.TestDataset(data=src_tokens_no_pad)

        noising_dataset = noising.NoisingDataset(
            src_dataset=src_dataset,
            src_dict=src_dict,
            seed=1234,
            max_word_shuffle_distance=3,
            word_dropout_prob=0.2,
            word_blanking_prob=0.2,
            noising_class=noising.UnsupervisedMTNoising,
        )
        tgt = src_dataset
        if use_append_eos_dataset:
            tgt = AppendEosDataset(src_dataset, src_dict.eos())
        language_pair_dataset = LanguagePairDataset(src=noising_dataset,
                                                    tgt=tgt,
                                                    src_sizes=None,
                                                    src_dict=src_dict)

        dataloader = torch.utils.data.DataLoader(
            dataset=language_pair_dataset,
            batch_size=2,
            collate_fn=language_pair_dataset.collater,
        )
        denoising_batch_result = next(iter(dataloader))
        return denoising_batch_result
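
AppendEosDataset here is a small test wrapper; a minimal sketch, assuming it only appends the EOS index to every item it serves:

import torch

class AppendEosDataset(torch.utils.data.Dataset):
    # Minimal sketch: yield each item from `dataset` with `eos` appended.
    def __init__(self, dataset, eos):
        self.dataset = dataset
        self.eos = eos

    def __getitem__(self, index):
        item = self.dataset[index]
        return torch.cat([item, item.new_tensor([self.eos])])

    def __len__(self):
        return len(self.dataset)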
Example no. 7
    def _build_dataset(self, data, **kwargs):
        sizes = [len(x) for x in data]
        underlying_ds = test_utils.TestDataset(data)
        return TokenBlockDataset(underlying_ds, sizes, **kwargs)
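
A hedged usage sketch: the kwargs pass straight through to TokenBlockDataset, so each test can pick its own blocking strategy. With break_mode="eos" and EOS index 1 (the convention in fairseq's own token-block tests), every block is exactly one sentence:

# Sketch of a call from a test method on the same class:
data = [
    torch.tensor([5, 4, 3, 2, 1], dtype=torch.long),
    torch.tensor([8, 7, 6, 1], dtype=torch.long),
]
ds = self._build_dataset(data, block_size=None, pad=0, eos=1, break_mode="eos")
assert ds[0].tolist() == [5, 4, 3, 2, 1]
assert ds[1].tolist() == [8, 7, 6, 1]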