def _get_noising_dataset_batch(
    self, src_tokens_no_pad, src_dict, append_eos_to_tgt=False,
):
    """
    Constructs a NoisingDataset and the corresponding
    ``LanguagePairDataset(NoisingDataset(src), src)``. If
    *append_eos_to_tgt* is True, wrap the source dataset in
    :class:`TransformEosDataset` to append EOS to the clean source when
    using it as the target.
    """
    src_dataset = test_utils.TestDataset(data=src_tokens_no_pad)

    noising_dataset = noising.NoisingDataset(
        src_dataset=src_dataset,
        src_dict=src_dict,
        seed=1234,
        max_word_shuffle_distance=3,
        word_dropout_prob=0.2,
        word_blanking_prob=0.2,
        noising_class=noising.UnsupervisedMTNoising,
    )
    tgt = src_dataset
    language_pair_dataset = LanguagePairDataset(
        src=noising_dataset, tgt=tgt, src_sizes=None, src_dict=src_dict
    )
    language_pair_dataset = TransformEosDataset(
        language_pair_dataset,
        src_dict.eos(),
        append_eos_to_tgt=append_eos_to_tgt,
    )

    dataloader = torch.utils.data.DataLoader(
        dataset=language_pair_dataset,
        batch_size=2,
        collate_fn=language_pair_dataset.collater,
    )
    denoising_batch_result = next(iter(dataloader))
    return denoising_batch_result
def _get_noising_dataset_batch(
    self, src_tokens_no_pad, src_dict, use_append_eos_dataset=False,
):
    """
    Constructs a NoisingDataset and the corresponding
    LanguagePairDataset(NoisingDataset(src), src). If
    use_append_eos_dataset is True, wrap the source dataset in
    AppendEosDataset to append EOS to the clean source when using it as
    the target. In practice, we should use AppendEosDataset because our
    models usually have a source without EOS but a target with EOS.
    """
    src_dataset = test_utils.TestDataset(data=src_tokens_no_pad)

    noising_dataset = noising.NoisingDataset(
        src_dataset=src_dataset,
        src_dict=src_dict,
        seed=1234,
        max_word_shuffle_distance=3,
        word_dropout_prob=0.2,
        word_blanking_prob=0.2,
        noising_class=noising.UnsupervisedMTNoising,
    )
    tgt = src_dataset
    if use_append_eos_dataset:
        tgt = AppendEosDataset(src_dataset, src_dict.eos())
    language_pair_dataset = LanguagePairDataset(
        src=noising_dataset, tgt=tgt, src_sizes=None, src_dict=src_dict
    )

    dataloader = torch.utils.data.DataLoader(
        dataset=language_pair_dataset,
        batch_size=2,
        collate_fn=language_pair_dataset.collater,
    )
    denoising_batch_result = next(iter(dataloader))
    return denoising_batch_result
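# Sketch (not part of the original tests) of how the helper above might be
# exercised. The fixture helper `_get_test_data` is hypothetical, and the
# batch keys are an assumption based on the standard fairseq
# LanguagePairDataset collater, which exposes the noised source under
# "net_input" and the clean source as "target".
def test_noising_dataset_appends_eos_to_target(self):
    # Hypothetical fixture: builds a dictionary plus un-padded source tokens.
    src_dict, src_tokens, _ = self._get_test_data(append_eos=False)
    batch = self._get_noising_dataset_batch(
        src_tokens_no_pad=src_tokens,
        src_dict=src_dict,
        use_append_eos_dataset=True,
    )
    noised_src = batch["net_input"]["src_tokens"]
    clean_tgt = batch["target"]
    # Each noised source row should be paired with a clean target row.
    self.assertEqual(noised_src.size(0), clean_tgt.size(0))
    # Since EOS was appended to the clean source used as the target, every
    # target row should contain the EOS symbol.
    for row in clean_tgt:
        self.assertIn(src_dict.eos(), row.tolist())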
def load_dataset(
    self, split, src_bin_path, tgt_bin_path, seed=None, use_noiser=False
):
    """
    Load a dataset split. Seed and noiser are only used for loading train
    data, not eval data.
    """
    parallel_dataset, src_dataset, tgt_dataset = data_utils.load_parallel_dataset(
        source_lang=self.source_lang,
        target_lang=self.target_lang,
        src_bin_path=src_bin_path,
        tgt_bin_path=tgt_bin_path,
        source_dictionary=self.source_dictionary,
        target_dictionary=self.target_dictionary,
        split=split,
        remove_eos_from_source=not self.args.append_eos_to_source,
        append_eos_to_target=True,
        char_source_dict=self.char_source_dict,
        log_verbose=self.args.log_verbose,
    )
    dataset_map = OrderedDict(
        [(f"{self.source_lang}-{self.target_lang}", parallel_dataset)]
    )
    if use_noiser:
        # Add denoising autoencoder pairs (noised text -> clean text) for the
        # parallel source/target and, if configured, for extra monolingual data.
        if getattr(self.args, "denoising_source_parallel", False):
            dataset_map[
                f"{self.source_lang}-{self.source_lang}"
            ] = weighted_data.WeightedLanguagePairDataset(
                src=noising.NoisingDataset(
                    src_dataset=src_dataset,
                    src_dict=self.source_dictionary,
                    seed=seed,
                    noiser=self.source_noiser,
                ),
                tgt=src_dataset,
                src_sizes=src_dataset.sizes,
                src_dict=self.source_dictionary,
                remove_eos_from_source=not self.args.append_eos_to_source,
                append_eos_to_target=True,
            )
        if getattr(self.args, "denoising_target_parallel", False):
            dataset_map[
                f"{self.target_lang}-{self.target_lang}"
            ] = weighted_data.WeightedLanguagePairDataset(
                src=noising.NoisingDataset(
                    src_dataset=tgt_dataset,
                    src_dict=self.target_dictionary,
                    seed=seed,
                    noiser=self.target_noiser,
                ),
                tgt=tgt_dataset,
                src_sizes=tgt_dataset.sizes,
                src_dict=self.target_dictionary,
                remove_eos_from_source=not self.args.append_eos_to_source,
                append_eos_to_target=True,
            )
        if getattr(self.args, "denoising_source_mono", False):
            source_mono_dataset = self.load_monolingual_dataset(
                self.args.train_mono_source_binary_path
            )
            dataset_map[
                f"{self.source_lang}-{self.source_lang}_"
                f"{constants.MONOLINGUAL_DATA_IDENTIFIER}"
            ] = weighted_data.WeightedLanguagePairDataset(
                src=noising.NoisingDataset(
                    src_dataset=source_mono_dataset,
                    src_dict=self.source_dictionary,
                    seed=seed,
                    noiser=self.source_noiser,
                ),
                tgt=source_mono_dataset,
                src_sizes=source_mono_dataset.sizes,
                src_dict=self.source_dictionary,
                remove_eos_from_source=not self.args.append_eos_to_source,
                append_eos_to_target=True,
            )
        if getattr(self.args, "denoising_target_mono", False):
            target_mono_dataset = self.load_monolingual_dataset(
                self.args.train_mono_target_binary_path
            )
            dataset_map[
                f"{self.target_lang}-{self.target_lang}_"
                f"{constants.MONOLINGUAL_DATA_IDENTIFIER}"
            ] = weighted_data.WeightedLanguagePairDataset(
                src=noising.NoisingDataset(
                    src_dataset=target_mono_dataset,
                    src_dict=self.target_dictionary,
                    seed=seed,
                    noiser=self.target_noiser,
                ),
                tgt=target_mono_dataset,
                src_sizes=target_mono_dataset.sizes,
                src_dict=self.target_dictionary,
                remove_eos_from_source=not self.args.append_eos_to_source,
                append_eos_to_target=True,
            )

    # Zip all sub-datasets together; shorter ones are repeated round-robin.
    self.datasets[split] = RoundRobinZipDatasets(dataset_map)

    if self.args.log_verbose:
        print("Finished loading dataset", flush=True)

    print(f"| {split} {len(self.datasets[split])} datasets")
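# Illustrative driver (hypothetical, not from the original task code): invoking
# the load_dataset variant above with noising enabled. The instantiated `task`,
# the `args` object, and the binary-path argument names are assumptions about
# the surrounding training setup.
task.load_dataset(
    split="train",
    src_bin_path=args.train_source_binary_path,
    tgt_bin_path=args.train_target_binary_path,
    seed=args.seed,
    use_noiser=True,
)
# With source_lang="en", target_lang="de" and all four denoising flags set, the
# resulting RoundRobinZipDatasets would be keyed roughly as "en-de", "en-en",
# "de-de", "en-en_<MONOLINGUAL_DATA_IDENTIFIER>" and
# "de-de_<MONOLINGUAL_DATA_IDENTIFIER>".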
def load_dataset(
    self, split, src_bin_path, tgt_bin_path, seed=None, noiser=None
):
    """
    Load a dataset split. Seed and noiser are only used for loading train
    data, not eval data.
    """
    corpus = pytorch_translate_data.ParallelCorpusConfig(
        source=pytorch_translate_data.CorpusConfig(
            dialect=self.source_lang, data_file=src_bin_path
        ),
        target=pytorch_translate_data.CorpusConfig(
            dialect=self.target_lang, data_file=tgt_bin_path
        ),
        weights_file=None,
    )

    if self.args.log_verbose:
        print("Starting to load binarized data files.", flush=True)
    data_utils.validate_corpus_exists(corpus=corpus, split=split)

    tgt_dataset = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
        corpus.target.data_file
    )
    if self.char_source_dict is not None:
        # Use the word + character dataset when a character-level source
        # dictionary is configured.
        src_dataset = char_data.InMemoryNumpyWordCharDataset.create_from_file(
            corpus.source.data_file
        )
    else:
        src_dataset = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
            corpus.source.data_file
        )
    parallel_dataset = weighted_data.WeightedLanguagePairDataset(
        src=src_dataset,
        src_sizes=src_dataset.sizes,
        src_dict=self.source_dictionary,
        tgt=tgt_dataset,
        tgt_sizes=tgt_dataset.sizes,
        tgt_dict=self.target_dictionary,
        remove_eos_from_source=not self.args.append_eos_to_source,
        append_eos_to_target=True,
    )
    dataset_map = OrderedDict(
        [(f"{self.source_lang}-{self.target_lang}", parallel_dataset)]
    )
    if noiser is not None:
        # Add denoising autoencoder pairs (noised text -> clean text) using the
        # noiser passed in by the caller.
        if getattr(self.args, "denoising_source_parallel", False):
            dataset_map[
                f"{self.source_lang}-{self.source_lang}"
            ] = weighted_data.WeightedLanguagePairDataset(
                src=noising.NoisingDataset(
                    src_dataset=src_dataset,
                    src_dict=self.source_dictionary,
                    seed=seed,
                    noiser=noiser,
                ),
                tgt=src_dataset,
                src_sizes=src_dataset.sizes,
                src_dict=self.source_dictionary,
                remove_eos_from_source=not self.args.append_eos_to_source,
                append_eos_to_target=True,
            )
        if getattr(self.args, "denoising_target_parallel", False):
            dataset_map[
                f"{self.target_lang}-{self.target_lang}"
            ] = weighted_data.WeightedLanguagePairDataset(
                src=noising.NoisingDataset(
                    src_dataset=tgt_dataset,
                    src_dict=self.target_dictionary,
                    seed=seed,
                    noiser=noiser,
                ),
                tgt=tgt_dataset,
                src_sizes=tgt_dataset.sizes,
                src_dict=self.target_dictionary,
                remove_eos_from_source=not self.args.append_eos_to_source,
                append_eos_to_target=True,
            )
        if getattr(self.args, "denoising_source_mono", False):
            source_mono_dataset = self.load_monolingual_dataset(
                self.args.train_mono_source_binary_path
            )
            dataset_map[
                f"{self.source_lang}-{self.source_lang}_"
                f"{constants.MONOLINGUAL_DATA_IDENTIFIER}"
            ] = weighted_data.WeightedLanguagePairDataset(
                src=noising.NoisingDataset(
                    src_dataset=source_mono_dataset,
                    src_dict=self.source_dictionary,
                    seed=seed,
                    noiser=noiser,
                ),
                tgt=source_mono_dataset,
                src_sizes=source_mono_dataset.sizes,
                src_dict=self.source_dictionary,
                remove_eos_from_source=not self.args.append_eos_to_source,
                append_eos_to_target=True,
            )
        if getattr(self.args, "denoising_target_mono", False):
            target_mono_dataset = self.load_monolingual_dataset(
                self.args.train_mono_target_binary_path
            )
            dataset_map[
                f"{self.target_lang}-{self.target_lang}_"
                f"{constants.MONOLINGUAL_DATA_IDENTIFIER}"
            ] = weighted_data.WeightedLanguagePairDataset(
                src=noising.NoisingDataset(
                    src_dataset=target_mono_dataset,
                    src_dict=self.target_dictionary,
                    seed=seed,
                    noiser=noiser,
                ),
                tgt=target_mono_dataset,
                src_sizes=target_mono_dataset.sizes,
                src_dict=self.target_dictionary,
                remove_eos_from_source=not self.args.append_eos_to_source,
                append_eos_to_target=True,
            )

    self.datasets[split] = RoundRobinZipDatasets(dataset_map)

    if self.args.log_verbose:
        print("Finished loading dataset", flush=True)

    print(f"| {split} {len(self.datasets[split])} datasets")