def setUp(self): self.tgt_dict, self.w1, self.w2, self.src_tokens, self.src_lengths, self.model = ( test_utils.sequence_generator_setup()) dummy_src_samples = self.src_tokens self.tgt_dataset = test_utils.TestDataset(data=dummy_src_samples)
def setUp(self): ( self.tgt_dict, self.w1, self.w2, self.src_tokens, self.src_lengths, self.model, ) = test_utils.sequence_generator_setup() dummy_src_samples = self.src_tokens self.tgt_dataset = test_utils.TestDataset(data=dummy_src_samples) self.cuda = torch.cuda.is_available()
def setUp(self): self.tgt_dict, self.w1, self.w2, self.src_tokens, self.src_lengths, self.model = ( test_utils.sequence_generator_setup()) backtranslation_args = argparse.Namespace() """ Same as defaults from fairseq/options.py """ backtranslation_args.backtranslation_unkpen = 0 backtranslation_args.backtranslation_sampling = False backtranslation_args.backtranslation_max_len_a = 0 backtranslation_args.backtranslation_max_len_b = 200 backtranslation_args.backtranslation_beam = 2 self.backtranslation_args = backtranslation_args dummy_src_samples = self.src_tokens self.tgt_dataset = test_utils.TestDataset(data=dummy_src_samples)
def _get_noising_dataset_batch( self, src_tokens_no_pad, src_dict, append_eos_to_tgt=False, ): """ Constructs a NoisingDataset and the corresponding ``LanguagePairDataset(NoisingDataset(src), src)``. If *append_eos_to_tgt* is True, wrap the source dataset in :class:`TransformEosDataset` to append EOS to the clean source when using it as the target. """ src_dataset = test_utils.TestDataset(data=src_tokens_no_pad) noising_dataset = noising.NoisingDataset( src_dataset=src_dataset, src_dict=src_dict, seed=1234, max_word_shuffle_distance=3, word_dropout_prob=0.2, word_blanking_prob=0.2, noising_class=noising.UnsupervisedMTNoising, ) tgt = src_dataset language_pair_dataset = LanguagePairDataset(src=noising_dataset, tgt=tgt, src_sizes=None, src_dict=src_dict) language_pair_dataset = TransformEosDataset( language_pair_dataset, src_dict.eos(), append_eos_to_tgt=append_eos_to_tgt, ) dataloader = torch.utils.data.DataLoader( dataset=language_pair_dataset, batch_size=2, collate_fn=language_pair_dataset.collater, ) denoising_batch_result = next(iter(dataloader)) return denoising_batch_result
def test_eval_dataloader(self): dictionary = test_utils.dummy_dictionary(10) assert len(dictionary) == 14 # 4 extra special symbols assert dictionary.pad() == 1 dataset = test_utils.TestDataset([ torch.tensor([4, 5, 6, 7], dtype=torch.long), torch.tensor([8, 9, 10, 11], dtype=torch.long), torch.tensor([12, 13], dtype=torch.long), ]) dataset = MonolingualDataset(dataset, sizes=[4, 4, 2], src_vocab=dictionary) config = LanguageModelingConfig(tokens_per_sample=4) task = LanguageModelingTask(config, dictionary) eval_dataloader = task.eval_lm_dataloader( dataset=dataset, batch_size=1, context_window=2, num_workers=0, ) batch = next(eval_dataloader) assert batch["net_input"]["src_tokens"][0].tolist() == [ 4, 5, 6, 7, 1, 1 ] assert batch["target"][0].tolist() == [4, 5, 6, 7, 1, 1] batch = next(eval_dataloader) assert batch["net_input"]["src_tokens"][0].tolist() == [ 6, 7, 8, 9, 10, 11 ] assert batch["target"][0].tolist() == [1, 1, 8, 9, 10, 11] batch = next(eval_dataloader) assert batch["net_input"]["src_tokens"][0].tolist() == [10, 11, 12, 13] assert batch["target"][0].tolist() == [1, 1, 12, 13]
def _get_noising_dataset_batch(self, src_tokens_no_pad, src_dict, use_append_eos_dataset=False): """ Constructs a NoisingDataset and the corresponding LanguagePairDataset(NoisingDataset(src), src). If we set use_append_eos_dataset to True, wrap the source dataset in AppendEosDataset to append EOS to the clean source when using it as the target. In practice, we should use AppendEosDataset because our models usually have source without EOS but target with EOS. """ src_dataset = test_utils.TestDataset(data=src_tokens_no_pad) noising_dataset = noising.NoisingDataset( src_dataset=src_dataset, src_dict=src_dict, seed=1234, max_word_shuffle_distance=3, word_dropout_prob=0.2, word_blanking_prob=0.2, noising_class=noising.UnsupervisedMTNoising, ) tgt = src_dataset if use_append_eos_dataset: tgt = AppendEosDataset(src_dataset, src_dict.eos()) language_pair_dataset = LanguagePairDataset(src=noising_dataset, tgt=tgt, src_sizes=None, src_dict=src_dict) dataloader = torch.utils.data.DataLoader( dataset=language_pair_dataset, batch_size=2, collate_fn=language_pair_dataset.collater, ) denoising_batch_result = next(iter(dataloader)) return denoising_batch_result
def _build_dataset(self, data, **kwargs): sizes = [len(x) for x in data] underlying_ds = test_utils.TestDataset(data) return TokenBlockDataset(underlying_ds, sizes, **kwargs)