def __init__(
    self,
    tgt_dataset,
    src_dict,
    tgt_dict=None,
    backtranslation_fn=None,
    output_collater=None,
    backward_output_collater=None,
    cuda=True,
    noising=False,
    **kwargs
):
    self.tgt_dataset = tgt_dataset
    self.backtranslation_fn = backtranslation_fn
    # Fall back to the target dataset's collater when no collater is supplied,
    # and to the forward collater for the backward direction.
    self.output_collater = (
        output_collater if output_collater is not None else tgt_dataset.collater
    )
    self.backward_output_collater = (
        backward_output_collater
        if backward_output_collater is not None
        else self.output_collater
    )
    # Only use CUDA if it is actually available.
    self.cuda = cuda if torch.cuda.is_available() else False
    self.src_dict = src_dict
    self.tgt_dict = tgt_dict
    if noising:
        # Light noising: no word shuffling, 10% word dropout and blanking,
        # with "▁" as the subword continuation marker.
        self.noising = UnsupervisedMTNoising(
            self.src_dict,
            max_word_shuffle_distance=0,
            word_dropout_prob=0.1,
            word_blanking_prob=0.1,
            bpe_cont_marker="▁",
        )
        # Alternative, more aggressive configuration kept for reference:
        # self.noising = UnsupervisedMTNoising(
        #     self.src_dict,
        #     max_word_shuffle_distance=5,
        #     word_dropout_prob=0.2,
        #     word_blanking_prob=0.2,
        #     bpe_cont_marker="▁",
        # )
    else:
        self.noising = None
def test_load_data_noising(self):
    test_args = test_utils.ModelParamsDict()
    test_args.source_lang = "en"
    test_args.target_lang = "fr"
    test_args.log_verbose = False
    src_dict, tgt_dict = test_utils.create_vocab_dictionaries()
    num_paths = 4
    src_bin_path, tgt_bin_path = {}, {}
    for i in range(num_paths):
        src_text_file, tgt_text_file = test_utils.create_test_text_files()
        src_bin_path[i] = preprocess.binarize_text_file(
            text_file=src_text_file,
            dictionary=src_dict,
            output_path=tempfile.NamedTemporaryFile().name,
            append_eos=True,
            reverse_order=False,
        )
        tgt_bin_path[i] = preprocess.binarize_text_file(
            text_file=tgt_text_file,
            dictionary=tgt_dict,
            output_path=tempfile.NamedTemporaryFile().name,
            append_eos=True,
            reverse_order=False,
        )
    task = tasks.PytorchTranslateTask(test_args, src_dict, tgt_dict)
    split = "1"
    task.load_dataset(
        split,
        src_bin_path,
        tgt_bin_path,
        noiser={
            0: UnsupervisedMTNoising(
                dictionary=src_dict,
                max_word_shuffle_distance=3,
                word_dropout_prob=0.2,
                word_blanking_prob=0.2,
            )
        },
    )
    self.assertEqual(len(task.datasets[split]), 16)
    self.assertIsInstance(task.datasets[split].datasets[0].src, NoisingDataset)
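# The sketch below is not part of the test suite; it is a minimal, hedged
# illustration of what an UnsupervisedMTNoising instance like the one
# configured above does to a single sentence. It assumes noising() takes
# tokens shaped (length, batch) plus per-sentence lengths, mirroring how
# fairseq's NoisingDataset calls it; the toy vocabulary is made up.
import torch
from fairseq.data import Dictionary
from fairseq.data.noising import UnsupervisedMTNoising

toy_dict = Dictionary()
for word in ["hello", "world", "again"]:
    toy_dict.add_symbol(word)

toy_noiser = UnsupervisedMTNoising(
    dictionary=toy_dict,
    max_word_shuffle_distance=3,
    word_dropout_prob=0.2,
    word_blanking_prob=0.2,
)
tokens = torch.LongTensor(
    [[toy_dict.index(w) for w in ["hello", "world", "again"]] + [toy_dict.eos()]]
)
lengths = torch.LongTensor([tokens.size(1)])
# Transpose to (length, batch) before noising and back afterwards, as
# NoisingDataset does; dropped words can make the output shorter than the input.
noisy_tokens = toy_noiser.noising(tokens.t(), lengths).t()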
def _load_dataset_multi_path(
    self, split: str, src_bin_path: str, tgt_bin_path: str, is_npz: bool = True
):
    # Despite the str annotations, src_bin_path and tgt_bin_path are dicts
    # mapping dataset keys to binarized file paths; the asserts below enforce
    # that both sides use the same keys.
    assert type(tgt_bin_path) is not str
    assert set(src_bin_path.keys()) == set(tgt_bin_path.keys())
    source_lang = self.args.source_lang or "src"
    target_lang = self.args.target_lang or "tgt"
    direction = source_lang + "-" + target_lang
    dataset_upsampling = (
        pytorch_translate_utils.maybe_parse_collection_argument(
            self.args.dataset_upsampling
        )[direction]
        if self.args.dataset_upsampling
        else None
    )
    dataset_relative_ratio = (
        pytorch_translate_utils.maybe_parse_collection_argument(
            self.args.dataset_relative_ratio
        )[direction]
        if self.args.dataset_relative_ratio
        else None
    )
    # Build one UnsupervisedMTNoising instance per dataset key that has any
    # noise option configured for this language direction.
    noiser = {}
    noise_options = [
        "word_dropout_prob",
        "max_word_shuffle_distance",
        "word_blanking_prob",
    ]
    for option in noise_options:
        option_map = getattr(self.args, option + "_map", None)
        if option_map:
            option_map = pytorch_translate_utils.maybe_parse_collection_argument(
                option_map
            )[direction]
            for key in option_map:
                if key not in noiser:
                    noiser[key] = {
                        noise_option: None for noise_option in noise_options
                    }
                noiser[key][option] = option_map[key]
    for key in noiser:
        noiser[key] = UnsupervisedMTNoising(
            dictionary=self.src_dict,
            max_word_shuffle_distance=noiser[key]["max_word_shuffle_distance"] or 0,
            word_dropout_prob=noiser[key]["word_dropout_prob"] or 0,
            word_blanking_prob=noiser[key]["word_blanking_prob"] or 0,
        )
    if dataset_relative_ratio is not None:
        assert dataset_upsampling is None, (
            "dataset_upsampling and dataset_relative_ratio cannot be "
            "specified together."
        )
        assert dataset_relative_ratio[0] in src_bin_path.keys()
        self._load_dataset_multi_path_helper(
            split=split,
            src_multiple_bin_paths=src_bin_path,
            tgt_multiple_bin_paths=tgt_bin_path,
            dataset_relative_ratio=dataset_relative_ratio,
            seed=self.args.seed,
            noiser=noiser,
            is_npz=is_npz,
        )
    elif dataset_upsampling is not None:
        for key in dataset_upsampling.keys():
            assert key in src_bin_path.keys()
        self._load_dataset_multi_path_helper(
            split=split,
            src_multiple_bin_paths=src_bin_path,
            tgt_multiple_bin_paths=tgt_bin_path,
            dataset_upsampling=dataset_upsampling,
            seed=self.args.seed,
            noiser=noiser,
            is_npz=is_npz,
        )
    else:
        self._load_dataset_multi_path_helper(
            split=split,
            src_multiple_bin_paths=src_bin_path,
            tgt_multiple_bin_paths=tgt_bin_path,
            seed=self.args.seed,
            noiser=noiser,
            is_npz=is_npz,
        )
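# A small, self-contained sketch (not repository code) of the per-dataset
# noiser construction above. It starts from already parsed per-direction maps;
# the CLI string format handled by
# pytorch_translate_utils.maybe_parse_collection_argument is deliberately not
# reproduced here, and the dataset keys and probabilities are made up.
from fairseq.data import Dictionary
from fairseq.data.noising import UnsupervisedMTNoising

example_src_dict = Dictionary()
example_noise_options = [
    "word_dropout_prob",
    "max_word_shuffle_distance",
    "word_blanking_prob",
]
# Hypothetical post-parse values for one language direction: only some
# datasets and some options are configured.
parsed_option_maps = {
    "word_dropout_prob": {"corpus_a": 0.2},
    "word_blanking_prob": {"corpus_a": 0.2, "corpus_b": 0.1},
}

example_noiser = {}
for option in example_noise_options:
    for key, value in parsed_option_maps.get(option, {}).items():
        example_noiser.setdefault(
            key, {opt: None for opt in example_noise_options}
        )[option] = value
for key, opts in example_noiser.items():
    example_noiser[key] = UnsupervisedMTNoising(
        dictionary=example_src_dict,
        max_word_shuffle_distance=opts["max_word_shuffle_distance"] or 0,
        word_dropout_prob=opts["word_dropout_prob"] or 0,
        word_blanking_prob=opts["word_blanking_prob"] or 0,
    )
# example_noiser now maps "corpus_a" and "corpus_b" to configured noisers;
# dataset keys without any entry are simply left un-noised.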
def setup_training_model(args):
    """Parse args, load dataset, and build model with criterion."""
    if not torch.cuda.is_available():
        print("Warning: training without CUDA is likely to be slow!")
    else:
        torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)

    # Setup task and load dataset
    task = tasks.setup_task(args)

    # Build model and criterion
    model = task.build_model(args)
    print("| building criterion")
    criterion = task.build_criterion(args)
    print(f"| model {args.arch}, criterion {criterion.__class__.__name__}")
    print(f"| num. model params: {sum(p.numel() for p in model.parameters())}")

    if args.task == constants.SEMI_SUPERVISED_TASK:
        # TODO(T35638969): hide this inside the task itself, just use self.args
        task.load_dataset(
            split=args.train_subset,
            src_bin_path=args.train_source_binary_path,
            tgt_bin_path=args.train_target_binary_path,
            forward_model=task.forward_model,
            backward_model=task.backward_model,
        )
    elif args.task == "pytorch_translate_denoising_autoencoder":
        task.load_dataset(
            split=args.train_subset,
            src_bin_path=args.train_source_binary_path,
            tgt_bin_path=args.train_target_binary_path,
            seed=args.seed,
            use_noiser=True,
        )
    elif args.task == "dual_learning_task":
        task.load_dataset(split=args.train_subset, seed=args.seed)
    elif args.task == "pytorch_translate_knowledge_distillation":
        task.load_dataset(
            split=args.train_subset,
            src_bin_path=args.train_source_binary_path,
            tgt_bin_path=args.train_target_binary_path,
            weights_file=getattr(args, "train_weights_path", None),
            is_train=True,
        )
    else:
        # Support both single and multi path loading for now
        dataset_upsampling = getattr(args, "dataset_upsampling", None)
        dataset_relative_ratio = getattr(args, "dataset_relative_ratio", None)
        source_lang = getattr(args, "source_lang", "src")
        target_lang = getattr(args, "target_lang", "tgt")
        direction = source_lang + "-" + target_lang
        if dataset_upsampling:
            dataset_upsampling = pytorch_translate_utils.maybe_parse_collection_argument(
                dataset_upsampling
            )[direction]
        if dataset_relative_ratio:
            dataset_relative_ratio = pytorch_translate_utils.maybe_parse_collection_argument(
                dataset_relative_ratio
            )[direction]
        # Build one UnsupervisedMTNoising instance per dataset key that has
        # any noise option configured for this language direction.
        noiser = {}
        noise_options = [
            "word_dropout_prob",
            "max_word_shuffle_distance",
            "word_blanking_prob",
        ]
        for option in noise_options:
            option_map = getattr(args, option + "_map", None)
            if option_map:
                option_map = pytorch_translate_utils.maybe_parse_collection_argument(
                    option_map
                )[direction]
                for key in option_map:
                    if key not in noiser:
                        noiser[key] = {
                            noise_option: None for noise_option in noise_options
                        }
                    noiser[key][option] = option_map[key]
        for key in noiser:
            noiser[key] = UnsupervisedMTNoising(
                dictionary=task.src_dict,
                max_word_shuffle_distance=noiser[key]["max_word_shuffle_distance"]
                or 0,
                word_dropout_prob=noiser[key]["word_dropout_prob"] or 0,
                word_blanking_prob=noiser[key]["word_blanking_prob"] or 0,
            )
        task.load_dataset(
            split=args.train_subset,
            src_bin_path=pytorch_translate_utils.maybe_parse_collection_argument(
                args.train_source_binary_path
            ),
            tgt_bin_path=pytorch_translate_utils.maybe_parse_collection_argument(
                args.train_target_binary_path
            ),
            weights_file=getattr(args, "train_weights_path", None),
            dataset_upsampling=dataset_upsampling,
            dataset_relative_ratio=dataset_relative_ratio,
            seed=args.seed,
            noiser=noiser,
        )

    if args.task == "dual_learning_task":
        task.load_dataset(split=args.valid_subset, seed=args.seed)
    else:
        task.load_dataset(
            split=args.valid_subset,
            src_bin_path=args.eval_source_binary_path,
            tgt_bin_path=args.eval_target_binary_path,
        )
    return task, model, criterion
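# A hedged sketch (not repository code) of the argument fields that
# setup_training_model() reads on its default branch, collected into an
# argparse.Namespace. All values are placeholders: the real training entry
# point builds `args` from the full CLI parser, and actually calling the
# function would additionally require valid binarized data at these paths.
# Optional fields such as dataset_upsampling, dataset_relative_ratio, the
# *_map noise options, and train_weights_path are read with getattr(..., None)
# and may be omitted entirely.
from argparse import Namespace

example_args = Namespace(
    device_id=0,
    seed=1,
    arch="rnn",                    # illustrative architecture name
    task="pytorch_translate",      # illustrative default (non-special-cased) task
    train_subset="train",
    valid_subset="valid",
    source_lang="en",
    target_lang="fr",
    train_source_binary_path="/path/to/train.src.npz",  # placeholder
    train_target_binary_path="/path/to/train.tgt.npz",  # placeholder
    eval_source_binary_path="/path/to/valid.src.npz",   # placeholder
    eval_target_binary_path="/path/to/valid.tgt.npz",   # placeholder
)
# task, model, criterion = setup_training_model(example_args)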