def load_dataset(self, split, src_bin_path, tgt_bin_path, weights_file=None):
    corpus = pytorch_translate_data.ParallelCorpusConfig(
        source=pytorch_translate_data.CorpusConfig(
            dialect=self.args.source_lang, data_file=src_bin_path
        ),
        target=pytorch_translate_data.CorpusConfig(
            dialect=self.args.target_lang, data_file=tgt_bin_path
        ),
        weights_file=weights_file,
    )

    if self.args.log_verbose:
        print("Starting to load binarized data files.", flush=True)

    if not os.path.exists(corpus.source.data_file):
        raise ValueError(f"{corpus.source.data_file} for {split} not found!")
    if not os.path.exists(corpus.target.data_file):
        raise ValueError(f"{corpus.target.data_file} for {split} not found!")

    dst_dataset = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
        corpus.target.data_file
    )

    weights_dataset = None
    if corpus.weights_file and os.path.exists(corpus.weights_file):
        weights_dataset = weighted_data.IndexedWeightsDataset(corpus.weights_file)
        assert len(dst_dataset) == len(weights_dataset)

    if self.char_source_dict is not None:
        src_dataset = char_data.InMemoryNumpyWordCharDataset.create_from_file(
            corpus.source.data_file
        )
        self.datasets[split] = char_data.LanguagePairSourceCharDataset(
            src=src_dataset,
            src_sizes=src_dataset.sizes,
            src_dict=self.source_dictionary,
            tgt=dst_dataset,
            tgt_sizes=dst_dataset.sizes,
            tgt_dict=self.target_dictionary,
            weights=weights_dataset,
        )
    else:
        src_dataset = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
            corpus.source.data_file
        )
        self.datasets[split] = weighted_data.WeightedLanguagePairDataset(
            src=src_dataset,
            src_sizes=src_dataset.sizes,
            src_dict=self.source_dictionary,
            tgt=dst_dataset,
            tgt_sizes=dst_dataset.sizes,
            tgt_dict=self.target_dictionary,
            weights=weights_dataset,
        )

    if self.args.log_verbose:
        print("Finished loading dataset", flush=True)

    print(f"| {split} {len(self.datasets[split])} examples")

def load_dataset(
    self, split, src_bin_path, tgt_bin_path, weights_file=None, is_train=False
):
    """
    Currently this method does not support character models.
    """
    corpus = pytorch_translate_data.ParallelCorpusConfig(
        source=pytorch_translate_data.CorpusConfig(
            dialect=self.args.source_lang, data_file=src_bin_path
        ),
        target=pytorch_translate_data.CorpusConfig(
            dialect=self.args.target_lang, data_file=tgt_bin_path
        ),
        weights_file=weights_file,
    )

    if self.args.log_verbose:
        print("Starting to load binarized data files.", flush=True)
    data_utils.validate_corpus_exists(corpus=corpus, split=split)

    dst_dataset = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
        corpus.target.data_file
    )
    src_dataset = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
        corpus.source.data_file
    )

    if is_train:
        self.datasets[split] = TeacherDataset(
            src=src_dataset,
            src_sizes=src_dataset.sizes,
            src_dict=self.src_dict,
            tgt=dst_dataset,
            tgt_sizes=dst_dataset.sizes,
            tgt_dict=self.tgt_dict,
            teacher_models=self.teacher_models,
            top_k_teacher_tokens=self.top_k_teacher_tokens,
            top_k_teacher_scores=self.top_k_teacher_scores,
            top_k_teacher_indices=self.top_k_teacher_indices,
            left_pad_source=False,
        )
    else:
        self.datasets[split] = weighted_data.WeightedLanguagePairDataset(
            src=src_dataset,
            src_sizes=src_dataset.sizes,
            src_dict=self.src_dict,
            tgt=dst_dataset,
            tgt_sizes=dst_dataset.sizes,
            tgt_dict=self.tgt_dict,
            weights=None,
            left_pad_source=False,
        )

    if self.args.log_verbose:
        print("Finished loading dataset", flush=True)

    print(f"| {split} {len(self.datasets[split])} examples")

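# --- Hypothetical usage sketch (assumption, not from the original source) ---
# Illustrates how a task exposing the load_dataset() method above might load
# a training split (teacher-scored) and a validation split (plain weighted
# pairs). `task` and the arg field names are illustrative placeholders drawn
# from the other snippets in this section, not a confirmed API.
def load_train_and_valid_splits(task, args):
    # Training data is wrapped in TeacherDataset so teacher-model scores can
    # be attached to each example.
    task.load_dataset(
        split=args.train_subset,
        src_bin_path=args.train_source_binary_path,
        tgt_bin_path=args.train_target_binary_path,
        is_train=True,
    )
    # Validation data skips the teacher machinery and uses plain weighted
    # language pairs.
    task.load_dataset(
        split=args.valid_subset,
        src_bin_path=args.eval_source_binary_path,
        tgt_bin_path=args.eval_target_binary_path,
        is_train=False,
    )
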
def load_parallel_dataset(
    source_lang,
    target_lang,
    src_bin_path,
    tgt_bin_path,
    source_dictionary,
    target_dictionary,
    split,
    remove_eos_from_source,
    append_eos_to_target=True,
    char_source_dict=None,
    log_verbose=True,
):
    corpus = pytorch_translate_data.ParallelCorpusConfig(
        source=pytorch_translate_data.CorpusConfig(
            dialect=source_lang, data_file=src_bin_path
        ),
        target=pytorch_translate_data.CorpusConfig(
            dialect=target_lang, data_file=tgt_bin_path
        ),
        weights_file=None,
    )

    if log_verbose:
        print("Starting to load binarized data files.", flush=True)
    validate_corpus_exists(corpus=corpus, split=split)

    tgt_dataset = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
        corpus.target.data_file
    )
    if char_source_dict is not None:
        src_dataset = char_data.InMemoryNumpyWordCharDataset.create_from_file(
            corpus.source.data_file
        )
    else:
        src_dataset = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
            corpus.source.data_file
        )
    parallel_dataset = weighted_data.WeightedLanguagePairDataset(
        src=src_dataset,
        src_sizes=src_dataset.sizes,
        src_dict=source_dictionary,
        tgt=tgt_dataset,
        tgt_sizes=tgt_dataset.sizes,
        tgt_dict=target_dictionary,
        remove_eos_from_source=remove_eos_from_source,
        append_eos_to_target=append_eos_to_target,
    )
    return parallel_dataset, src_dataset, tgt_dataset

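# --- Hypothetical usage sketch (assumption, not from the original source) ---
# A minimal illustration of calling the load_parallel_dataset() helper above.
# The language codes and binarized file paths are illustrative only; the
# dictionaries are whatever source/target vocabulary objects the surrounding
# task provides.
def build_train_datasets(src_dict, tgt_dict):
    parallel_dataset, src_dataset, tgt_dataset = load_parallel_dataset(
        source_lang="en",                   # illustrative language codes
        target_lang="de",
        src_bin_path="/data/train.en.npz",  # illustrative binarized paths
        tgt_bin_path="/data/train.de.npz",
        source_dictionary=src_dict,
        target_dictionary=tgt_dict,
        split="train",
        remove_eos_from_source=True,
        char_source_dict=None,              # word-level source (no char model)
        log_verbose=False,
    )
    return parallel_dataset, src_dataset, tgt_dataset
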
def setup_training(args):
    """Parse args, load dataset, and load model trainer."""
    if not torch.cuda.is_available():
        raise NotImplementedError("Training on CPU is not supported")
    torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)

    # Load dataset
    splits = [args.train_subset, args.valid_subset]
    validate_and_set_default_args(args)

    train_corpus = pytorch_translate_data.ParallelCorpusConfig(
        source=pytorch_translate_data.CorpusConfig(
            dialect=args.source_lang, data_file=args.train_source_binary_prefix
        ),
        target=pytorch_translate_data.CorpusConfig(
            dialect=args.target_lang, data_file=args.train_target_binary_prefix
        ),
    )
    eval_corpus = pytorch_translate_data.ParallelCorpusConfig(
        source=pytorch_translate_data.CorpusConfig(
            dialect=args.source_lang, data_file=args.eval_source_binary_prefix
        ),
        target=pytorch_translate_data.CorpusConfig(
            dialect=args.target_lang, data_file=args.eval_target_binary_prefix
        ),
    )

    if args.log_verbose:
        print("Starting to load binarized data files.", flush=True)
    dataset = pytorch_translate_data.load_binarized_dataset(
        train_corpus=train_corpus,
        eval_corpus=eval_corpus,
        train_split=args.train_subset,
        eval_split=args.valid_subset,
        args=args,
    )
    if args.log_verbose:
        print("Finished loading dataset", flush=True)

    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args, so that it's saved in checkpoints
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print(f"| [{dataset.src}] dictionary: {len(dataset.src_dict)} types")
    print(f"| [{dataset.dst}] dictionary: {len(dataset.dst_dict)} types")
    for split in splits:
        print(f"| {split} {len(dataset.splits[split])} examples")

    # Build model and criterion
    model = models.build_model(args, dataset.src_dict, dataset.dst_dict)
    criterion = criterions.build_criterion(args, dataset.src_dict, dataset.dst_dict)
    print(f"| model {args.arch}, criterion {criterion.__class__.__name__}")
    print(f"| num. model params: {sum(p.data.numel() for p in model.parameters())}")

    # Build trainer
    trainer = Trainer(args, model, criterion)
    print(f"| training on {args.distributed_world_size} GPUs")
    print(
        f"| max tokens per GPU = {args.max_tokens} and "
        f"max sentences per GPU = {args.max_sentences}",
        flush=True,
    )

    extra_state = load_existing_checkpoint(args.save_dir, args.restore_file, trainer)

    return extra_state, trainer, dataset

def load_dataset(
    self, split, src_bin_path, tgt_bin_path, forward_model=None, backward_model=None
):
    """Load a dataset split."""
    corpus = ptt_data.ParallelCorpusConfig(
        source=ptt_data.CorpusConfig(
            dialect=self.source_lang, data_file=src_bin_path
        ),
        target=ptt_data.CorpusConfig(
            dialect=self.target_lang, data_file=tgt_bin_path
        ),
        weights_file=None,
    )

    if self.args.log_verbose:
        print("Starting to load binarized data files.", flush=True)
    data_utils.validate_corpus_exists(corpus=corpus, split=split)

    forward_tgt_dataset = ptt_data.InMemoryNumpyDataset.create_from_file(
        corpus.target.data_file
    )
    backward_tgt_dataset = ptt_data.InMemoryNumpyDataset.create_from_file(
        corpus.source.data_file
    )
    forward_src_dataset = ptt_data.InMemoryNumpyDataset.create_from_file(
        corpus.source.data_file
    )
    backward_src_dataset = ptt_data.InMemoryNumpyDataset.create_from_file(
        corpus.target.data_file
    )
    forward_parallel_dataset = weighted_data.WeightedLanguagePairDataset(
        src=forward_src_dataset,
        src_sizes=forward_src_dataset.sizes,
        src_dict=self.source_dictionary,
        tgt=forward_tgt_dataset,
        tgt_sizes=forward_tgt_dataset.sizes,
        tgt_dict=self.target_dictionary,
        remove_eos_from_source=self.remove_eos_from_source,
        append_eos_to_target=True,
    )
    backward_parallel_dataset = weighted_data.WeightedLanguagePairDataset(
        src=backward_src_dataset,
        src_sizes=backward_src_dataset.sizes,
        src_dict=self.target_dictionary,
        tgt=backward_tgt_dataset,
        tgt_sizes=backward_tgt_dataset.sizes,
        tgt_dict=self.source_dictionary,
        remove_eos_from_source=self.remove_eos_from_source,
        append_eos_to_target=True,
    )

    dataset_map = OrderedDict(
        [
            (f"{self.source_lang}-{self.target_lang}", forward_parallel_dataset),
            (f"{self.target_lang}-{self.source_lang}", backward_parallel_dataset),
        ]
    )

    assert (forward_model and backward_model) or (
        forward_model is None and backward_model is None
    ), (
        "Only one of forward or backward models can't be null;"
        " both have to be non-null or null"
    )

    if forward_model and backward_model:
        fwd_generator = beam_decode.SequenceGenerator(
            models=[forward_model], tgt_dict=self.source_dictionary
        )
        bwd_generator = beam_decode.SequenceGenerator(
            models=[backward_model], tgt_dict=self.target_dictionary
        )

        def monolingual_dataset(
            path,
            dictionary,
            is_source=False,
            num_examples_limit: Optional[int] = None,
        ):
            dataset = self.load_monolingual_dataset(
                path, is_source=is_source, num_examples_limit=num_examples_limit
            )
            return LanguagePairDataset(
                src=dataset,
                src_sizes=dataset.sizes,
                src_dict=dictionary,
                tgt=None,
                tgt_sizes=None,
                tgt_dict=None,
            )

        monolingual_num_examples_limit = None
        if self.args.monolingual_ratio is not None:
            monolingual_num_examples_limit = int(
                self.args.monolingual_ratio * len(forward_parallel_dataset)
            )

        src_dataset = monolingual_dataset(
            path=self.args.train_mono_source_binary_path,
            dictionary=self.source_dictionary,
            is_source=True,
            num_examples_limit=monolingual_num_examples_limit,
        )
        tgt_dataset = monolingual_dataset(
            path=self.args.train_mono_target_binary_path,
            dictionary=self.target_dictionary,
            is_source=False,
            num_examples_limit=monolingual_num_examples_limit,
        )
        dataset_map[
            f"{self.source_lang}-"
            f"{self.target_lang}_{constants.MONOLINGUAL_DATA_IDENTIFIER}"
        ] = BacktranslationDataset(
            tgt_dataset=TransformEosDataset(
                dataset=tgt_dataset,
                eos=self.target_dictionary.eos(),
                # Remove EOS from the input before backtranslation.
                remove_eos_from_src=True,
            ),
            backtranslation_fn=bwd_generator.generate,
            max_len_a=self.args.max_len_a,
            max_len_b=self.args.max_len_b,
            output_collater=TransformEosDataset(
                dataset=tgt_dataset,
                eos=self.target_dictionary.eos(),
                # The original input (now the target) doesn't have an EOS,
                # so we need to add one. The generated backtranslation
                # (now the source) will have an EOS, so we want to remove it.
                append_eos_to_tgt=True,
                remove_eos_from_src=True,
            ).collater,
        )
        dataset_map[
            f"{self.target_lang}-"
            f"{self.source_lang}_{constants.MONOLINGUAL_DATA_IDENTIFIER}"
        ] = BacktranslationDataset(
            tgt_dataset=src_dataset,
            backtranslation_fn=fwd_generator.generate,
            max_len_a=self.args.max_len_a,
            max_len_b=self.args.max_len_b,
            output_collater=TransformEosDataset(
                dataset=src_dataset,
                eos=self.source_dictionary.eos(),
                # The original input (now the target) doesn't have an EOS,
                # so we need to add one. The generated backtranslation
                # (now the source) will have an EOS, so we want to remove it.
                append_eos_to_tgt=True,
                remove_eos_from_src=True,
            ).collater,
        )

    # print before loading RoundRobinZipDatasets to help catch any bugs
    for dataset_key, dataset in dataset_map.items():
        print(f"| {split}: {dataset_key} {len(dataset)} examples in dataset")

    self.datasets[split] = RoundRobinZipDatasets(dataset_map)
    print(
        f"| {split} {len(self.datasets[split])} examples in RoundRobinZipDatasets"
    )

    if self.args.log_verbose:
        print("Finished loading dataset", flush=True)

def load_dataset(self, split, src_bin_path, tgt_bin_path, seed=None, noiser=None):
    """
    Load a dataset split. Seed and noiser are only used for loading train
    data, not eval data.
    """
    corpus = pytorch_translate_data.ParallelCorpusConfig(
        source=pytorch_translate_data.CorpusConfig(
            dialect=self.source_lang, data_file=src_bin_path
        ),
        target=pytorch_translate_data.CorpusConfig(
            dialect=self.target_lang, data_file=tgt_bin_path
        ),
        weights_file=None,
    )

    if self.args.log_verbose:
        print("Starting to load binarized data files.", flush=True)
    data_utils.validate_corpus_exists(corpus=corpus, split=split)

    tgt_dataset = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
        corpus.target.data_file
    )
    if self.char_source_dict is not None:
        src_dataset = char_data.InMemoryNumpyWordCharDataset.create_from_file(
            corpus.source.data_file
        )
    else:
        src_dataset = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
            corpus.source.data_file
        )
    parallel_dataset = weighted_data.WeightedLanguagePairDataset(
        src=src_dataset,
        src_sizes=src_dataset.sizes,
        src_dict=self.source_dictionary,
        tgt=tgt_dataset,
        tgt_sizes=tgt_dataset.sizes,
        tgt_dict=self.target_dictionary,
        remove_eos_from_source=not self.args.append_eos_to_source,
        append_eos_to_target=True,
    )
    dataset_map = OrderedDict(
        [(f"{self.source_lang}-{self.target_lang}", parallel_dataset)]
    )

    if noiser is not None:
        if getattr(self.args, "denoising_source_parallel", False):
            dataset_map[
                f"{self.source_lang}-{self.source_lang}"
            ] = weighted_data.WeightedLanguagePairDataset(
                src=noising.NoisingDataset(
                    src_dataset=src_dataset,
                    src_dict=self.source_dictionary,
                    seed=seed,
                    noiser=self.noiser,
                ),
                tgt=src_dataset,
                src_sizes=src_dataset.sizes,
                src_dict=self.source_dictionary,
                remove_eos_from_source=not self.args.append_eos_to_source,
                append_eos_to_target=True,
            )
        if getattr(self.args, "denoising_target_parallel", False):
            dataset_map[
                f"{self.target_lang}-{self.target_lang}"
            ] = weighted_data.WeightedLanguagePairDataset(
                src=noising.NoisingDataset(
                    src_dataset=tgt_dataset,
                    src_dict=self.target_dictionary,
                    seed=seed,
                    noiser=self.noiser,
                ),
                tgt=tgt_dataset,
                src_sizes=tgt_dataset.sizes,
                src_dict=self.target_dictionary,
                remove_eos_from_source=not self.args.append_eos_to_source,
                append_eos_to_target=True,
            )
        if getattr(self.args, "denoising_source_mono", False):
            source_mono_dataset = self.load_monolingual_dataset(
                self.args.train_mono_source_binary_path
            )
            dataset_map[
                f"{self.source_lang}-{self.source_lang}_"
                f"{constants.MONOLINGUAL_DATA_IDENTIFIER}"
            ] = weighted_data.WeightedLanguagePairDataset(
                src=noising.NoisingDataset(
                    src_dataset=source_mono_dataset,
                    src_dict=self.source_dictionary,
                    seed=seed,
                    noiser=self.noiser,
                ),
                tgt=source_mono_dataset,
                src_sizes=source_mono_dataset.sizes,
                src_dict=self.source_dictionary,
                remove_eos_from_source=not self.args.append_eos_to_source,
                append_eos_to_target=True,
            )
        if getattr(self.args, "denoising_target_mono", False):
            target_mono_dataset = self.load_monolingual_dataset(
                self.args.train_mono_target_binary_path
            )
            dataset_map[
                f"{self.target_lang}-{self.target_lang}_"
                f"{constants.MONOLINGUAL_DATA_IDENTIFIER}"
            ] = weighted_data.WeightedLanguagePairDataset(
                src=noising.NoisingDataset(
                    src_dataset=target_mono_dataset,
                    src_dict=self.target_dictionary,
                    seed=seed,
                    noiser=self.noiser,
                ),
                tgt=target_mono_dataset,
                src_sizes=target_mono_dataset.sizes,
                src_dict=self.target_dictionary,
                remove_eos_from_source=not self.args.append_eos_to_source,
                append_eos_to_target=True,
            )

    self.datasets[split] = RoundRobinZipDatasets(dataset_map)

    if self.args.log_verbose:
        print("Finished loading dataset", flush=True)

    print(f"| {split} {len(self.datasets[split])} datasets")

def setup_training(args):
    """Parse args, load dataset, and load model trainer."""
    if not torch.cuda.is_available():
        raise NotImplementedError("Training on CPU is not supported")
    torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)

    # Load dataset
    splits = [args.train_subset, args.valid_subset]
    validate_and_set_default_args(args)

    train_corpus = pytorch_translate_data.ParallelCorpusConfig(
        source=pytorch_translate_data.CorpusConfig(
            dialect=args.source_lang, data_file=args.train_source_binary_path
        ),
        target=pytorch_translate_data.CorpusConfig(
            dialect=args.target_lang, data_file=args.train_target_binary_path
        ),
        weights_file=args.train_weights_path
        if hasattr(args, "train_weights_path")
        else None,
    )
    eval_corpus = pytorch_translate_data.ParallelCorpusConfig(
        source=pytorch_translate_data.CorpusConfig(
            dialect=args.source_lang, data_file=args.eval_source_binary_path
        ),
        target=pytorch_translate_data.CorpusConfig(
            dialect=args.target_lang, data_file=args.eval_target_binary_path
        ),
        weights_file=None,
    )

    if args.log_verbose:
        print("Starting to load binarized data files.", flush=True)
    use_char_source = args.arch == "char_source"
    dataset = pytorch_translate_data.load_binarized_dataset(
        train_corpus=train_corpus,
        eval_corpus=eval_corpus,
        train_split=args.train_subset,
        eval_split=args.valid_subset,
        args=args,
        use_char_source=use_char_source,
    )
    if args.log_verbose:
        print("Finished loading dataset", flush=True)

    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args, so that it's saved in checkpoints
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print(f"| [{dataset.src}] dictionary: {len(dataset.src_dict)} types")
    print(f"| [{dataset.dst}] dictionary: {len(dataset.dst_dict)} types")
    for split in splits:
        print(f"| {split} {len(dataset.splits[split])} examples")

    # Build model and criterion
    model = models.build_model(args, dataset.src_dict, dataset.dst_dict)
    print("building criterion")
    criterion = criterions.build_criterion(args, dataset.src_dict, dataset.dst_dict)
    print(f"| model {args.arch}, criterion {criterion.__class__.__name__}")
    print(f"| num. model params: {sum(p.numel() for p in model.parameters())}")

    # Load pretrained model weights if applicable
    if args.pretrained_weights_file:
        utils.load_model_state(
            args.pretrained_weights_file,
            model,
            cuda_device=torch.cuda.current_device(),
        )

    # Build trainer
    trainer = Trainer(args, model, criterion)
    print(f"| training on {args.distributed_world_size} GPUs")
    print(
        f"| max tokens per GPU = {args.max_tokens} and "
        f"max sentences per GPU = {args.max_sentences}",
        flush=True,
    )

    os.makedirs(args.save_dir, exist_ok=True)
    checkpoint_path = os.path.join(args.save_dir, args.restore_file)
    if not os.path.isfile(checkpoint_path) and args.multi_model_restore_files:
        print(f"| Restoring individual models from {args.multi_model_restore_files}")
        extra_state = multi_model.import_individual_models(
            args.multi_model_restore_files, trainer
        )
    else:
        extra_state = load_existing_checkpoint(checkpoint_path, trainer)

    return extra_state, trainer, dataset

def setup_training(args):
    """Parse args, load dataset, and load model trainer."""
    if not torch.cuda.is_available():
        raise NotImplementedError("Training on CPU is not supported")
    torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)

    # Load dataset
    splits = [args.train_subset, args.valid_subset]
    validate_and_set_default_args(args)

    train_corpus = pytorch_translate_data.ParallelCorpusConfig(
        source=pytorch_translate_data.CorpusConfig(
            dialect=args.source_lang, data_file=args.train_source_binary_path
        ),
        target=pytorch_translate_data.CorpusConfig(
            dialect=args.target_lang, data_file=args.train_target_binary_path
        ),
        weights_file=args.train_weights_path
        if hasattr(args, "train_weights_path")
        else None,
    )
    eval_corpus = pytorch_translate_data.ParallelCorpusConfig(
        source=pytorch_translate_data.CorpusConfig(
            dialect=args.source_lang, data_file=args.eval_source_binary_path
        ),
        target=pytorch_translate_data.CorpusConfig(
            dialect=args.target_lang, data_file=args.eval_target_binary_path
        ),
        weights_file=None,
    )

    if args.log_verbose:
        print("Starting to load binarized data files.", flush=True)
    use_char_source = args.arch == "char_source"
    dataset = pytorch_translate_data.load_binarized_dataset(
        train_corpus=train_corpus,
        eval_corpus=eval_corpus,
        train_split=args.train_subset,
        eval_split=args.valid_subset,
        args=args,
        use_char_source=use_char_source,
    )
    if args.log_verbose:
        print("Finished loading dataset", flush=True)

    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args, so that it's saved in checkpoints
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print(f"| [{dataset.src}] dictionary: {len(dataset.src_dict)} types")
    print(f"| [{dataset.dst}] dictionary: {len(dataset.dst_dict)} types")
    for split in splits:
        print(f"| {split} {len(dataset.splits[split])} examples")

    # Build model and criterion
    model = models.build_model(args, dataset.src_dict, dataset.dst_dict)
    print("building criterion")
    criterion = criterions.build_criterion(args, dataset.src_dict, dataset.dst_dict)
    print(f"| model {args.arch}, criterion {criterion.__class__.__name__}")
    print(f"| num. model params: {sum(p.numel() for p in model.parameters())}")

    # Build trainer
    trainer = Trainer(args, model, criterion)
    print(f"| training on {args.distributed_world_size} GPUs")
    print(
        f"| max tokens per GPU = {args.max_tokens} and "
        f"max sentences per GPU = {args.max_sentences}",
        flush=True,
    )

    os.makedirs(args.save_dir, exist_ok=True)

    # If --restore-file is already present under --save-dir, use that one
    # instead of the --restore-file that may be present under
    # --restore-checkpoint-dir. The idea is that --restore-checkpoint-dir
    # allows the user to specify restoring from a different run's
    # checkpoint (possibly with different training params), while not
    # polluting the previous run's checkpoint directory with new checkpoints.
    # However, if training gets interrupted and the user restarts training,
    # we want to resume from the checkpoints under --save-dir, instead of
    # restarting again from the old run's checkpoint under
    # --restore-checkpoint-dir.
    #
    # Note that if args.restore_file is an absolute path, os.path.join() will
    # ignore previous directory args and just use the absolute path as is.
    checkpoint_path = os.path.join(args.save_dir, args.restore_file)
    if os.path.exists(checkpoint_path):
        print(
            f"Using --save-dir={args.save_dir}, "
            f"--restore-file={args.restore_file}."
        )
    elif args.restore_checkpoint_dir:
        checkpoint_path = os.path.join(args.restore_checkpoint_dir, args.restore_file)
        print(
            f"Using --restore-checkpoint-dir={args.restore_checkpoint_dir}, "
            f"--restore-file={args.restore_file}."
        )

    if not os.path.isfile(checkpoint_path) and args.multi_model_restore_files:
        print(f"| Restoring individual models from {args.multi_model_restore_files}")
        extra_state = multi_model.import_individual_models(
            args.multi_model_restore_files, trainer
        )
    else:
        extra_state = load_existing_checkpoint(
            checkpoint_path=checkpoint_path,
            trainer=trainer,
            restore_state=args.restore_checkpoint_state,
        )

    return extra_state, trainer, dataset

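# --- Hypothetical driver sketch (assumption, not from the original source) ---
# Shows how a setup_training() variant above might be wired into a training
# entry point. `build_arg_parser` and `run_training_loop` are placeholder
# names for whatever argument parsing and main-loop code the surrounding
# project actually uses.
def main():
    parser = build_arg_parser()  # placeholder: builds the args used above
    args = parser.parse_args()
    extra_state, trainer, dataset = setup_training(args)
    run_training_loop(args, extra_state, trainer, dataset)  # placeholder


if __name__ == "__main__":
    main()
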
def load_dataset(
    self, split, src_bin_path, tgt_bin_path, forward_model=None, backward_model=None
):
    """Load a dataset split."""
    corpus = ptt_data.ParallelCorpusConfig(
        source=ptt_data.CorpusConfig(dialect=self.source_lang, data_file=src_bin_path),
        target=ptt_data.CorpusConfig(dialect=self.target_lang, data_file=tgt_bin_path),
        weights_file=None,
    )

    if self.args.log_verbose:
        print("Starting to load binarized data files.", flush=True)
    data_utils.validate_corpus_exists(corpus=corpus, split=split)

    forward_tgt_dataset = ptt_data.InMemoryNumpyDataset.create_from_file(
        corpus.target.data_file
    )
    backward_tgt_dataset = ptt_data.InMemoryNumpyDataset.create_from_file(
        corpus.source.data_file
    )
    forward_src_dataset = ptt_data.InMemoryNumpyDataset.create_from_file(
        corpus.source.data_file
    )
    backward_src_dataset = ptt_data.InMemoryNumpyDataset.create_from_file(
        corpus.target.data_file
    )
    forward_parallel_dataset = weighted_data.WeightedLanguagePairDataset(
        src=forward_src_dataset,
        src_sizes=forward_src_dataset.sizes,
        src_dict=self.source_dictionary,
        tgt=forward_tgt_dataset,
        tgt_sizes=forward_tgt_dataset.sizes,
        tgt_dict=self.target_dictionary,
        remove_eos_from_source=self.remove_eos_from_source,
        append_eos_to_target=True,
    )
    backward_parallel_dataset = weighted_data.WeightedLanguagePairDataset(
        src=backward_src_dataset,
        src_sizes=backward_src_dataset.sizes,
        src_dict=self.target_dictionary,
        tgt=backward_tgt_dataset,
        tgt_sizes=backward_tgt_dataset.sizes,
        tgt_dict=self.source_dictionary,
        remove_eos_from_source=self.remove_eos_from_source,
        append_eos_to_target=True,
    )

    dataset_map = OrderedDict(
        [
            (f"{self.source_lang}-{self.target_lang}", forward_parallel_dataset),
            (f"{self.target_lang}-{self.source_lang}", backward_parallel_dataset),
        ]
    )

    assert (forward_model and backward_model) or (
        forward_model is None and backward_model is None
    ), (
        "Only one of forward or backward models can't be null;"
        " both have to be non-null or null"
    )

    if forward_model and backward_model:
        dataset_map[
            f"{self.source_lang}-"
            f"{self.target_lang}_{constants.MONOLINGUAL_DATA_IDENTIFIER}"
        ] = BacktranslationDataset(
            tgt_dataset=self.load_monolingual_dataset(
                self.args.train_mono_target_binary_path
            ),
            tgt_dict=self.target_dictionary,
            backtranslation_model=backward_model,
            max_len_a=self.args.max_len_a,
            max_len_b=self.args.max_len_b,
            remove_eos_at_src=True,
            generator_class=beam_decode.SequenceGenerator,
        )
        dataset_map[
            f"{self.target_lang}-"
            f"{self.source_lang}_{constants.MONOLINGUAL_DATA_IDENTIFIER}"
        ] = BacktranslationDataset(
            tgt_dataset=self.load_monolingual_dataset(
                self.args.train_mono_source_binary_path
            ),
            tgt_dict=self.source_dictionary,
            backtranslation_model=forward_model,
            max_len_a=self.args.max_len_a,
            max_len_b=self.args.max_len_b,
            remove_eos_at_src=True,
            generator_class=beam_decode.SequenceGenerator,
        )

    self.datasets[split] = RoundRobinZipDatasets(dataset_map)

    if self.args.log_verbose:
        print("Finished loading dataset", flush=True)

    print(f"| {split} {len(self.datasets[split])} datasets")

def setup_training(args):
    """Parse args, load dataset, and load model trainer."""
    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)

    # Load dataset
    splits = [args.train_subset, args.valid_subset]

    if args.source_lang is None:
        args.source_lang = 'src'
    if args.target_lang is None:
        args.target_lang = 'tgt'

    assert_corpora_files_specified(args)
    train_corpus = pytorch_translate_data.ParallelCorpusConfig(
        source=pytorch_translate_data.CorpusConfig(
            dialect=args.source_lang,
            data_file=args.train_source_text_file,
        ),
        target=pytorch_translate_data.CorpusConfig(
            dialect=args.target_lang,
            data_file=args.train_target_text_file,
        ),
    )
    eval_corpus = pytorch_translate_data.ParallelCorpusConfig(
        source=pytorch_translate_data.CorpusConfig(
            dialect=args.source_lang,
            data_file=args.eval_source_text_file,
        ),
        target=pytorch_translate_data.CorpusConfig(
            dialect=args.target_lang,
            data_file=args.eval_target_text_file,
        ),
    )

    if args.log_verbose:
        print('Starting to load raw text files.', flush=True)
    dataset = pytorch_translate_data.load_raw_text_dataset(
        train_corpus=train_corpus,
        eval_corpus=eval_corpus,
        train_split=args.train_subset,
        eval_split=args.valid_subset,
        args=args,
    )
    if args.log_verbose:
        print('Finished loading dataset', flush=True)

    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args, so that it's saved in checkpoints
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print('| [{}] dictionary: {} types'.format(dataset.src, len(dataset.src_dict)))
    print('| [{}] dictionary: {} types'.format(dataset.dst, len(dataset.dst_dict)))
    for split in splits:
        print('| {} {} examples'.format(split, len(dataset.splits[split])))

    # Build model and criterion
    model = models.build_model(args, dataset.src_dict, dataset.dst_dict)
    criterion = criterions.build_criterion(args, dataset.src_dict, dataset.dst_dict)
    print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
    print('| num. model params: {}'.format(
        sum(p.data.numel() for p in model.parameters())
    ))

    # Build trainer
    trainer = Trainer(args, model, criterion)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print(
        '| max tokens per GPU = {} and max sentences per GPU = {}'.format(
            args.max_tokens, args.max_sentences
        ),
        flush=True,
    )

    extra_state = load_existing_checkpoint(
        args.save_dir, args.restore_file, trainer
    )

    return extra_state, trainer, dataset