    def load_dataset_from_text(
        self,
        split: str,
        source_text_file: str,
        target_text_file: str,
        append_eos: Optional[bool] = False,
        reverse_source: Optional[bool] = True,
    ):
        dst_dataset = data.IndexedRawTextDataset(
            path=target_text_file,
            dictionary=self.target_dictionary,
            # We always append EOS to the target sentence since we still want
            # the model to output an indication the sentence has finished, even
            # if we don't append the EOS symbol to the source sentence
            # (to prevent the model from misaligning UNKs or other words
            # to the frequently occurring EOS).
            append_eos=True,
            # We don't reverse the order of the target sentence, since
            # even if the source sentence is fed to the model backwards,
            # we still want the model to start outputting from the first word.
            reverse_order=False,
        )

        if self.char_source_dict is not None:
            src_dataset = char_data.InMemoryNumpyWordCharDataset()
            src_dataset.parse(
                path=source_text_file,
                word_dict=self.source_dictionary,
                char_dict=self.char_source_dict,
                reverse_order=reverse_source,
                append_eos=append_eos,
            )
            self.datasets[split] = char_data.LanguagePairSourceCharDataset(
                src=src_dataset,
                src_sizes=src_dataset.sizes,
                src_dict=self.source_dictionary,
                tgt=dst_dataset,
                tgt_sizes=dst_dataset.sizes,
                tgt_dict=self.target_dictionary,
            )
        else:
            src_dataset = data.IndexedRawTextDataset(
                path=source_text_file,
                dictionary=self.source_dictionary,
                append_eos=append_eos,
                reverse_order=reverse_source,
            )
            self.datasets[split] = data.LanguagePairDataset(
                src=src_dataset,
                src_sizes=src_dataset.sizes,
                src_dict=self.source_dictionary,
                tgt=dst_dataset,
                tgt_sizes=dst_dataset.sizes,
                tgt_dict=self.target_dictionary,
                left_pad_source=False,
            )

        print(f"| {split} {len(self.datasets[split])} examples")
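    # Usage sketch (hypothetical, not part of the original file): assuming
    # `task` is an instance of this task class with its word (and optional
    # character) dictionaries already built, a raw-text eval split could be
    # loaded like this. The file paths are placeholders.
    #
    #   task.load_dataset_from_text(
    #       split="valid",
    #       source_text_file="valid.src.txt",
    #       target_text_file="valid.tgt.txt",
    #       append_eos=False,
    #       reverse_source=True,
    #   )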
    def _load_dataset_single_path(
        self,
        split: str,
        src_bin_path: str,
        tgt_bin_path: str,
        weights_file=None,
        is_npz=True,
    ):
        corpus = pytorch_translate_data.ParallelCorpusConfig(
            source=pytorch_translate_data.CorpusConfig(
                dialect=self.args.source_lang, data_file=src_bin_path
            ),
            target=pytorch_translate_data.CorpusConfig(
                dialect=self.args.target_lang, data_file=tgt_bin_path
            ),
            weights_file=weights_file,
        )

        if self.args.log_verbose:
            print("Starting to load binarized data files.", flush=True)
        data_utils.validate_corpus_exists(corpus=corpus, split=split, is_npz=is_npz)

        dst_dataset = pytorch_translate_data.InMemoryIndexedDataset.create_from_file(
            corpus.target.data_file, is_npz=is_npz
        )
        if getattr(self.args, "reverse_target", None):
            dst_dataset.reverse()
        weights_dataset = None
        if corpus.weights_file and os.path.exists(corpus.weights_file):
            weights_dataset = weighted_data.IndexedWeightsDataset(corpus.weights_file)
            assert len(dst_dataset) == len(weights_dataset)

        if self.char_source_dict is not None:
            src_dataset = char_data.InMemoryNumpyWordCharDataset.create_from_file(
                corpus.source.data_file
            )
            self.datasets[split] = char_data.LanguagePairSourceCharDataset(
                src=src_dataset,
                src_sizes=src_dataset.sizes,
                src_dict=self.source_dictionary,
                tgt=dst_dataset,
                tgt_sizes=dst_dataset.sizes,
                tgt_dict=self.target_dictionary,
                weights=weights_dataset,
            )
        else:
            src_dataset = pytorch_translate_data.InMemoryIndexedDataset.create_from_file(
                corpus.source.data_file, is_npz=is_npz
            )
            self.datasets[split] = LanguagePairDataset(
                src=src_dataset,
                src_sizes=src_dataset.sizes,
                src_dict=self.source_dictionary,
                tgt=dst_dataset,
                tgt_sizes=dst_dataset.sizes,
                tgt_dict=self.target_dictionary,
                left_pad_source=False,
            )
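    # Usage sketch (hypothetical): loading one pair of binarized files for the
    # training split. The `is_npz` flag appears to select between numpy .npz
    # archives (True) and fairseq-style indexed binaries (False); paths below
    # are placeholders.
    #
    #   task._load_dataset_single_path(
    #       split="train",
    #       src_bin_path="train.src.npz",
    #       tgt_bin_path="train.tgt.npz",
    #       weights_file=None,
    #       is_npz=True,
    #   )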
    def load_dataset(self, split, src_bin_path, tgt_bin_path, weights_file=None):
        corpus = pytorch_translate_data.ParallelCorpusConfig(
            source=pytorch_translate_data.CorpusConfig(
                dialect=self.args.source_lang, data_file=src_bin_path
            ),
            target=pytorch_translate_data.CorpusConfig(
                dialect=self.args.target_lang, data_file=tgt_bin_path
            ),
            weights_file=weights_file,
        )

        if self.args.log_verbose:
            print("Starting to load binarized data files.", flush=True)
        data_utils.validate_corpus_exists(corpus=corpus, split=split)

        dst_dataset = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
            corpus.target.data_file
        )
        weights_dataset = None
        if corpus.weights_file and os.path.exists(corpus.weights_file):
            weights_dataset = weighted_data.IndexedWeightsDataset(corpus.weights_file)
            assert len(dst_dataset) == len(weights_dataset)

        if self.char_source_dict is not None:
            src_dataset = char_data.InMemoryNumpyWordCharDataset.create_from_file(
                corpus.source.data_file
            )
            self.datasets[split] = char_data.LanguagePairSourceCharDataset(
                src=src_dataset,
                src_sizes=src_dataset.sizes,
                src_dict=self.source_dictionary,
                tgt=dst_dataset,
                tgt_sizes=dst_dataset.sizes,
                tgt_dict=self.target_dictionary,
                weights=weights_dataset,
            )
        else:
            src_dataset = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
                corpus.source.data_file
            )
            self.datasets[split] = weighted_data.WeightedLanguagePairDataset(
                src=src_dataset,
                src_sizes=src_dataset.sizes,
                src_dict=self.source_dictionary,
                tgt=dst_dataset,
                tgt_sizes=dst_dataset.sizes,
                tgt_dict=self.target_dictionary,
                weights=weights_dataset,
                left_pad_source=False,
            )

        if self.args.log_verbose:
            print("Finished loading dataset", flush=True)

        print(f"| {split} {len(self.datasets[split])} examples")
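    # Usage sketch (hypothetical): same as the single-path loader above, but
    # with per-example weights. The weights file must contain one weight per
    # target example, as checked by the assertion in the method; paths are
    # placeholders.
    #
    #   task.load_dataset(
    #       split="train",
    #       src_bin_path="train.src.npz",
    #       tgt_bin_path="train.tgt.npz",
    #       weights_file="train.weights",
    #   )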
    def _load_dataset_multi_path_helper(
        self,
        split: str,
        src_multiple_bin_paths: Dict[str, str],
        tgt_multiple_bin_paths: Dict[str, str],
        dataset_upsampling: Optional[Dict[str, float]] = None,
        dataset_relative_ratio: Optional[Tuple[str, float]] = None,
        seed: Optional[int] = None,
        noiser: Optional[Dict[str, UnsupervisedMTNoising]] = None,
    ):
        corpora_map = pytorch_translate_data.ParallelCorporaMapConfig(
            src_files=src_multiple_bin_paths, tgt_files=tgt_multiple_bin_paths
        )
        datasets = OrderedDict()
        for key in corpora_map.src_files:
            src, tgt = corpora_map.src_files[key], corpora_map.tgt_files[key]
            tgt_dataset = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
                tgt
            )
            if self.char_source_dict is not None:
                src_dataset = char_data.InMemoryNumpyWordCharDataset.create_from_file(
                    src
                )
            else:
                src_dataset = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
                    src
                )
            src_sizes = src_dataset.sizes
            if noiser is not None and key in noiser:
                src_dataset = NoisingDataset(
                    src_dataset=src_dataset,
                    src_dict=self.source_dictionary,
                    seed=seed,
                    noiser=noiser[key],
                )
            if self.char_source_dict is not None:
                datasets[key] = char_data.LanguagePairSourceCharDataset(
                    src=src_dataset,
                    src_sizes=src_sizes,
                    src_dict=self.source_dictionary,
                    tgt=tgt_dataset,
                    tgt_sizes=tgt_dataset.sizes,
                    tgt_dict=self.target_dictionary,
                )
            else:
                datasets[key] = LanguagePairDataset(
                    src=src_dataset,
                    src_sizes=src_sizes,
                    src_dict=self.source_dictionary,
                    tgt=tgt_dataset,
                    tgt_sizes=tgt_dataset.sizes,
                    tgt_dict=self.target_dictionary,
                    left_pad_source=False,
                )

        total_line_count = sum(len(datasets[key]) for key in datasets)
        if dataset_relative_ratio:
            ds, ratio = dataset_relative_ratio
            line_count = len(datasets[ds])
            # By definition, ratio = u * line_count / sum(#lines of other
            # datasets), so the upsampling factor u for dataset `ds` is:
            u = (total_line_count - line_count) / line_count * ratio
            dataset_upsampling = {ds: u}
        elif not dataset_upsampling:
            dataset_upsampling = {}
        print(f"| dataset upsampling: {dataset_upsampling}")

        ds_list = []
        sample_ratios = []
        for key, val in datasets.items():
            ds_list.append(val)
            # Sample ratios must be integers, so the computed factor is
            # truncated; datasets without an explicit factor default to 1.
            sample_ratios.append(int(dataset_upsampling.get(key, 1)))

        self.datasets[split] = LanguagePairUpsamplingDataset(
            datasets=ds_list, sample_ratios=sample_ratios
        )
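    # Worked example (illustrative numbers only): with two corpora of
    # 900,000 ("general") and 100,000 ("in_domain") lines and
    # dataset_relative_ratio=("in_domain", 1.0), the factor is
    #
    #   u = (1_000_000 - 100_000) / 100_000 * 1.0 = 9.0
    #
    # so "in_domain" is repeated int(9.0) = 9 times, leaving the upsampled
    # split with roughly equal numbers of in-domain and general lines.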