Code example #1
    def _get_noising_dataset_batch(
        self,
        src_tokens_no_pad,
        src_dict,
        append_eos_to_tgt=False,
    ):
        """
        Constructs a NoisingDataset and the corresponding
        ``LanguagePairDataset(NoisingDataset(src), src)``. If
        *append_eos_to_tgt* is True, wrap the resulting LanguagePairDataset in
        :class:`TransformEosDataset` so that EOS is appended to the clean
        source when it is used as the target.
        """
        src_dataset = test_utils.TestDataset(data=src_tokens_no_pad)

        noising_dataset = noising.NoisingDataset(
            src_dataset=src_dataset,
            src_dict=src_dict,
            seed=1234,
            max_word_shuffle_distance=3,
            word_dropout_prob=0.2,
            word_blanking_prob=0.2,
            noising_class=noising.UnsupervisedMTNoising,
        )
        tgt = src_dataset
        language_pair_dataset = LanguagePairDataset(src=noising_dataset,
                                                    tgt=tgt,
                                                    src_sizes=None,
                                                    src_dict=src_dict)
        language_pair_dataset = TransformEosDataset(
            language_pair_dataset,
            src_dict.eos(),
            append_eos_to_tgt=append_eos_to_tgt,
        )

        dataloader = torch.utils.data.DataLoader(
            dataset=language_pair_dataset,
            batch_size=2,
            collate_fn=language_pair_dataset.collater,
        )
        denoising_batch_result = next(iter(dataloader))
        return denoising_batch_result
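
The helper above depends on test_utils.TestDataset, which lives in fairseq's test
utilities rather than in the public data API. As a rough orientation only, a minimal
stand-in could look like the sketch below; the exact interface (in particular the
`sizes` attribute) is an assumption, not the upstream implementation.

import torch


class TestDataset(torch.utils.data.Dataset):
    """Assumed minimal stand-in for test_utils.TestDataset: wraps a list of
    1-D token tensors so that NoisingDataset and LanguagePairDataset can
    index the examples and read their sizes."""

    def __init__(self, data):
        self.data = data                      # list of LongTensor sentences, no padding
        self.sizes = [len(x) for x in data]   # per-example lengths

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return len(self.data)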
Code example #2
File: test_noising.py  Project: cavdard/fairseq
    def _get_noising_dataset_batch(self,
                                   src_tokens_no_pad,
                                   src_dict,
                                   use_append_eos_dataset=False):
        """
        Constructs a NoisingDataset and the corresponding
        LanguagePairDataset(NoisingDataset(src), src). If
        *use_append_eos_dataset* is True, wrap the clean source dataset in
        AppendEosDataset so that EOS is appended when it is used as the
        target. In practice we should use AppendEosDataset, because our models
        usually expect a source without EOS but a target with EOS.
        """
        src_dataset = test_utils.TestDataset(data=src_tokens_no_pad)

        noising_dataset = noising.NoisingDataset(
            src_dataset=src_dataset,
            src_dict=src_dict,
            seed=1234,
            max_word_shuffle_distance=3,
            word_dropout_prob=0.2,
            word_blanking_prob=0.2,
            noising_class=noising.UnsupervisedMTNoising,
        )
        tgt = src_dataset
        if use_append_eos_dataset:
            tgt = AppendEosDataset(src_dataset, src_dict.eos())
        language_pair_dataset = LanguagePairDataset(src=noising_dataset,
                                                    tgt=tgt,
                                                    src_sizes=None,
                                                    src_dict=src_dict)

        dataloader = torch.utils.data.DataLoader(
            dataset=language_pair_dataset,
            batch_size=2,
            collate_fn=language_pair_dataset.collater,
        )
        denoising_batch_result = next(iter(dataloader))
        return denoising_batch_result
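
This older variant wraps the clean target in AppendEosDataset rather than
TransformEosDataset. If AppendEosDataset is not available in your checkout, a minimal
wrapper with the same intent could be sketched as follows; its interface here is an
assumption for illustration, not the original implementation.

import torch


class AppendEosDataset(torch.utils.data.Dataset):
    """Hypothetical minimal equivalent of the AppendEosDataset used above:
    appends the EOS index to every item of the wrapped dataset."""

    def __init__(self, dataset, eos):
        self.dataset = dataset
        self.eos = eos

    def __getitem__(self, index):
        item = self.dataset[index]  # 1-D LongTensor of token ids
        return torch.cat([item, item.new_tensor([self.eos])])

    def __len__(self):
        return len(self.dataset)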
Code example #3
    def load_dataset(self,
                     split,
                     src_bin_path,
                     tgt_bin_path,
                     seed=None,
                     use_noiser=False):
        """
        Load a dataset split. Seed and noiser are only used for loading train
        data, not eval data.
        """
        parallel_dataset, src_dataset, tgt_dataset = data_utils.load_parallel_dataset(
            source_lang=self.source_lang,
            target_lang=self.target_lang,
            src_bin_path=src_bin_path,
            tgt_bin_path=tgt_bin_path,
            source_dictionary=self.source_dictionary,
            target_dictionary=self.target_dictionary,
            split=split,
            remove_eos_from_source=not self.args.append_eos_to_source,
            append_eos_to_target=True,
            char_source_dict=self.char_source_dict,
            log_verbose=self.args.log_verbose,
        )

        dataset_map = OrderedDict([(f"{self.source_lang}-{self.target_lang}",
                                    parallel_dataset)])

        if use_noiser:
            if getattr(self.args, "denoising_source_parallel", False):
                dataset_map[(
                    f"{self.source_lang}-{self.source_lang}"
                )] = weighted_data.WeightedLanguagePairDataset(
                    src=noising.NoisingDataset(
                        src_dataset=src_dataset,
                        src_dict=self.source_dictionary,
                        seed=seed,
                        noiser=self.source_noiser,
                    ),
                    tgt=src_dataset,
                    src_sizes=src_dataset.sizes,
                    src_dict=self.source_dictionary,
                    remove_eos_from_source=not self.args.append_eos_to_source,
                    append_eos_to_target=True,
                )
            if getattr(self.args, "denoising_target_parallel", False):
                dataset_map[(
                    f"{self.target_lang}-{self.target_lang}"
                )] = weighted_data.WeightedLanguagePairDataset(
                    src=noising.NoisingDataset(
                        src_dataset=tgt_dataset,
                        src_dict=self.target_dictionary,
                        seed=seed,
                        noiser=self.target_noiser,
                    ),
                    tgt=tgt_dataset,
                    src_sizes=tgt_dataset.sizes,
                    src_dict=self.target_dictionary,
                    remove_eos_from_source=not self.args.append_eos_to_source,
                    append_eos_to_target=True,
                )

            if getattr(self.args, "denoising_source_mono", False):
                source_mono_dataset = self.load_monolingual_dataset(
                    self.args.train_mono_source_binary_path)
                dataset_map[(
                    f"{self.source_lang}-{self.source_lang}_"
                    f"{constants.MONOLINGUAL_DATA_IDENTIFIER}"
                )] = weighted_data.WeightedLanguagePairDataset(
                    src=noising.NoisingDataset(
                        src_dataset=source_mono_dataset,
                        src_dict=self.source_dictionary,
                        seed=seed,
                        noiser=self.source_noiser,
                    ),
                    tgt=source_mono_dataset,
                    src_sizes=source_mono_dataset.sizes,
                    src_dict=self.source_dictionary,
                    remove_eos_from_source=not self.args.append_eos_to_source,
                    append_eos_to_target=True,
                )
            if getattr(self.args, "denoising_target_mono", False):
                target_mono_dataset = self.load_monolingual_dataset(
                    self.args.train_mono_target_binary_path)
                dataset_map[(
                    f"{self.target_lang}-{self.target_lang}_"
                    f"{constants.MONOLINGUAL_DATA_IDENTIFIER}"
                )] = weighted_data.WeightedLanguagePairDataset(
                    src=noising.NoisingDataset(
                        src_dataset=target_mono_dataset,
                        src_dict=self.target_dictionary,
                        seed=seed,
                        noiser=self.target_noiser,
                    ),
                    tgt=target_mono_dataset,
                    src_sizes=target_mono_dataset.sizes,
                    src_dict=self.target_dictionary,
                    remove_eos_from_source=not self.args.append_eos_to_source,
                    append_eos_to_target=True,
                )

        self.datasets[split] = RoundRobinZipDatasets(dataset_map)

        if self.args.log_verbose:
            print("Finished loading dataset", flush=True)

        print(f"| {split} {len(self.datasets[split])} datasets")
Code example #4
    def load_dataset(self,
                     split,
                     src_bin_path,
                     tgt_bin_path,
                     seed=None,
                     noiser=None):
        """
        Load a dataset split. Seed and noiser are only used for loading train
        data, not eval data.
        """

        corpus = pytorch_translate_data.ParallelCorpusConfig(
            source=pytorch_translate_data.CorpusConfig(
                dialect=self.source_lang, data_file=src_bin_path),
            target=pytorch_translate_data.CorpusConfig(
                dialect=self.target_lang, data_file=tgt_bin_path),
            weights_file=None,
        )

        if self.args.log_verbose:
            print("Starting to load binarized data files.", flush=True)
        data_utils.validate_corpus_exists(corpus=corpus, split=split)

        tgt_dataset = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
            corpus.target.data_file)
        if self.char_source_dict is not None:
            src_dataset = char_data.InMemoryNumpyWordCharDataset.create_from_file(
                corpus.source.data_file)
        else:
            src_dataset = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
                corpus.source.data_file)
        parallel_dataset = weighted_data.WeightedLanguagePairDataset(
            src=src_dataset,
            src_sizes=src_dataset.sizes,
            src_dict=self.source_dictionary,
            tgt=tgt_dataset,
            tgt_sizes=tgt_dataset.sizes,
            tgt_dict=self.target_dictionary,
            remove_eos_from_source=not self.args.append_eos_to_source,
            append_eos_to_target=True,
        )
        dataset_map = OrderedDict([(f"{self.source_lang}-{self.target_lang}",
                                    parallel_dataset)])

        if noiser is not None:
            if getattr(self.args, "denoising_source_parallel", False):
                dataset_map[(
                    f"{self.source_lang}-{self.source_lang}"
                )] = weighted_data.WeightedLanguagePairDataset(
                    src=noising.NoisingDataset(
                        src_dataset=src_dataset,
                        src_dict=self.source_dictionary,
                        seed=seed,
                        noiser=self.noiser,
                    ),
                    tgt=src_dataset,
                    src_sizes=src_dataset.sizes,
                    src_dict=self.source_dictionary,
                    remove_eos_from_source=not self.args.append_eos_to_source,
                    append_eos_to_target=True,
                )
            if getattr(self.args, "denoising_target_parallel", False):
                dataset_map[(
                    f"{self.target_lang}-{self.target_lang}"
                )] = weighted_data.WeightedLanguagePairDataset(
                    src=noising.NoisingDataset(
                        src_dataset=tgt_dataset,
                        src_dict=self.target_dictionary,
                        seed=seed,
                        noiser=self.noiser,
                    ),
                    tgt=tgt_dataset,
                    src_sizes=tgt_dataset.sizes,
                    src_dict=self.target_dictionary,
                    remove_eos_from_source=not self.args.append_eos_to_source,
                    append_eos_to_target=True,
                )

            if getattr(self.args, "denoising_source_mono", False):
                source_mono_dataset = self.load_monolingual_dataset(
                    self.args.train_mono_source_binary_path)
                dataset_map[(
                    f"{self.source_lang}-{self.source_lang}_"
                    f"{constants.MONOLINGUAL_DATA_IDENTIFIER}"
                )] = weighted_data.WeightedLanguagePairDataset(
                    src=noising.NoisingDataset(
                        src_dataset=source_mono_dataset,
                        src_dict=self.source_dictionary,
                        seed=seed,
                        noiser=self.noiser,
                    ),
                    tgt=source_mono_dataset,
                    src_sizes=source_mono_dataset.sizes,
                    src_dict=self.source_dictionary,
                    remove_eos_from_source=not self.args.append_eos_to_source,
                    append_eos_to_target=True,
                )
            if getattr(self.args, "denoising_target_mono", False):
                target_mono_dataset = self.load_monolingual_dataset(
                    self.args.train_mono_target_binary_path)
                dataset_map[(
                    f"{self.target_lang}-{self.target_lang}_"
                    f"{constants.MONOLINGUAL_DATA_IDENTIFIER}"
                )] = weighted_data.WeightedLanguagePairDataset(
                    src=noising.NoisingDataset(
                        src_dataset=target_mono_dataset,
                        src_dict=self.target_dictionary,
                        seed=seed,
                        noiser=self.noiser,
                    ),
                    tgt=target_mono_dataset,
                    src_sizes=target_mono_dataset.sizes,
                    src_dict=self.target_dictionary,
                    remove_eos_from_source=not self.args.append_eos_to_source,
                    append_eos_to_target=True,
                )

        self.datasets[split] = RoundRobinZipDatasets(dataset_map)

        if self.args.log_verbose:
            print("Finished loading dataset", flush=True)

        print(f"| {split} {len(self.datasets[split])} datasets")