def binarize_text_file(
    text_file: str,
    dictionary: Dictionary,
    output_path: str,
    append_eos: bool,
    reverse_order: bool,
    use_char_data: bool = False,
    char_dictionary: Optional[Dictionary] = None,
    already_numberized: bool = False,
) -> str:
    """Binarize a text corpus into an in-memory numpy dataset and save it.

    Args:
        text_file: Path of the raw text corpus to parse.
        dictionary: Word-level dictionary used for numberizing tokens.
        output_path: Destination path; passed through
            maybe_generate_temp_file_path (presumably generates a temp path
            when falsy — defined elsewhere in this file).
        append_eos: Whether to append the EOS symbol to each sentence.
        reverse_order: Whether to reverse token order within each sentence.
        use_char_data: If True, build a word+character dataset instead of a
            plain word dataset (requires char_dictionary).
        char_dictionary: Character-level dictionary; only used when
            use_char_data is True.
        already_numberized: Whether the input lines are already token IDs
            (only honored by the non-char dataset).

    Returns:
        The path the binarized dataset was saved to.
    """
    output_path = maybe_generate_temp_file_path(output_path)
    # Choose the dataset class up front, then parse with the arguments each
    # class understands; the char dataset needs both dictionaries.
    if use_char_data:
        char_dataset = char_data.InMemoryNumpyWordCharDataset()
        char_dataset.parse(
            path=text_file,
            word_dict=dictionary,
            char_dict=char_dictionary,
            reverse_order=reverse_order,
            append_eos=append_eos,
        )
        char_dataset.save(output_path)
    else:
        word_dataset = pytorch_translate_data.InMemoryNumpyDataset()
        word_dataset.parse(
            path=text_file,
            dictionary=dictionary,
            reverse_order=reverse_order,
            append_eos=append_eos,
            already_numberized=already_numberized,
        )
        word_dataset.save(output_path)
    return output_path
def binarize_text_file_multilingual(
    corpus_configs: List[pytorch_translate_data.MultilingualCorpusConfig],
    output_path: str,
    append_eos: bool,
    reverse_order: bool,
    prepend_language_id: bool,
    use_char_data: bool = False,
    embed_bytes: bool = False,
    already_numberized: bool = False,
) -> str:
    """Binarize several corpora (one per language) into a single dataset.

    Args:
        corpus_configs: Per-corpus configs (path, dictionary, language id).
        output_path: Destination path; passed through
            maybe_generate_temp_file_path (presumably generates a temp path
            when falsy — defined elsewhere in this file).
        append_eos: Whether to append the EOS symbol to each sentence.
        reverse_order: Whether to reverse token order within each sentence.
        prepend_language_id: Whether to prepend the language-ID token.
        use_char_data: If True, build a word+character dataset.
        embed_bytes: Byte-embedding option; only forwarded to the char
            dataset's parser.
        already_numberized: Whether input lines are already token IDs.

    Returns:
        The path the binarized dataset was saved to.
    """
    output_path = maybe_generate_temp_file_path(output_path)
    # The char dataset accepts embed_bytes; the plain word dataset does not.
    if use_char_data:
        multiling_dataset = char_data.InMemoryNumpyWordCharDataset()
        multiling_dataset.parse_multilingual(
            corpus_configs,
            reverse_order=reverse_order,
            append_eos=append_eos,
            embed_bytes=embed_bytes,
            prepend_language_id=prepend_language_id,
            already_numberized=already_numberized,
        )
    else:
        multiling_dataset = pytorch_translate_data.InMemoryNumpyDataset()
        multiling_dataset.parse_multilingual(
            corpus_configs,
            append_eos=append_eos,
            reverse_order=reverse_order,
            prepend_language_id=prepend_language_id,
            already_numberized=already_numberized,
        )
    multiling_dataset.save(output_path)
    return output_path
def binarize_text_file(
    text_file: str,
    dictionary: Dictionary,
    output_path: str,
    append_eos: bool,
    reverse_order: bool,
    use_char_data: bool = False,
    char_dictionary: Optional[Dictionary] = None,
) -> str:
    """Binarize a text corpus into an in-memory numpy dataset and save it.

    When ``output_path`` is empty, a temporary file path is generated. The
    ``.npz`` suffix is always ensured so the returned path names the file
    numpy actually writes.

    Args:
        text_file: Path of the raw text corpus to parse.
        dictionary: Word-level dictionary used for numberizing tokens.
        output_path: Destination path, or falsy to use a temp file.
        append_eos: Whether to append the EOS symbol to each sentence.
        reverse_order: Whether to reverse token order within each sentence.
        use_char_data: If True, build a word+character dataset instead of a
            plain word dataset (requires char_dictionary).
        char_dictionary: Character-level dictionary; only used when
            use_char_data is True.

    Returns:
        The path the binarized dataset was saved to.
    """
    if not output_path:
        fd, output_path = tempfile.mkstemp()
        # Only the generated name is needed; close the descriptor right away.
        os.close(fd)
    # numpy silently appends ".npz" when it is missing, so normalize the path
    # up front to guarantee the returned path matches the file on disk.
    if not output_path.endswith(".npz"):
        output_path += ".npz"
    if use_char_data:
        dataset = char_data.InMemoryNumpyWordCharDataset()
        dataset.parse(
            path=text_file,
            word_dict=dictionary,
            char_dict=char_dictionary,
            reverse_order=reverse_order,
            append_eos=append_eos,
        )
    else:
        dataset = pytorch_translate_data.InMemoryNumpyDataset()
        dataset.parse(
            path=text_file,
            dictionary=dictionary,
            reverse_order=reverse_order,
            append_eos=append_eos,
        )
    dataset.save(output_path)
    return output_path
def load_dataset_from_text(
    self,
    split: str,
    source_text_file: str,
    target_text_file: str,
    append_eos: Optional[bool] = False,
    reverse_source: Optional[bool] = True,
):
    """Build a language-pair dataset from raw text and register it.

    Populates ``self.datasets[split]`` with either a word-level or a
    word+character source dataset, depending on whether a character source
    dictionary was configured on this task.

    Args:
        split: Dataset split name used as the key in ``self.datasets``.
        source_text_file: Path of the raw source-side text.
        target_text_file: Path of the raw target-side text.
        append_eos: Whether to append EOS to the *source* sentences.
        reverse_source: Whether to feed the source sentences reversed.
    """
    # We always append EOS to the target sentence since we still want the
    # model to output an indication the sentence has finished, even if we
    # don't append the EOS symbol to the source sentence (to prevent the
    # model from misaligning UNKs or other words to the frequently
    # occurring EOS). We also never reverse the target order: even when the
    # source is fed backwards, the model should start outputting from the
    # first word.
    target_dataset = data.IndexedRawTextDataset(
        path=target_text_file,
        dictionary=self.target_dictionary,
        append_eos=True,
        reverse_order=False,
    )
    use_char_source = self.char_source_dict is not None
    if use_char_source:
        source_dataset = char_data.InMemoryNumpyWordCharDataset()
        source_dataset.parse(
            path=source_text_file,
            word_dict=self.source_dictionary,
            char_dict=self.char_source_dict,
            reverse_order=reverse_source,
            append_eos=append_eos,
        )
        pair_dataset = char_data.LanguagePairSourceCharDataset(
            source_dataset,
            source_dataset.sizes,
            self.source_dictionary,
            target_dataset,
            target_dataset.sizes,
            self.target_dictionary,
        )
    else:
        source_dataset = data.IndexedRawTextDataset(
            path=source_text_file,
            dictionary=self.source_dictionary,
            append_eos=append_eos,
            reverse_order=reverse_source,
        )
        pair_dataset = data.LanguagePairDataset(
            source_dataset,
            source_dataset.sizes,
            self.source_dictionary,
            target_dataset,
            target_dataset.sizes,
            self.target_dictionary,
            left_pad_source=False,
        )
    self.datasets[split] = pair_dataset
    print(f"| {split} {len(self.datasets[split])} examples")
def make_language_pair_dataset_from_text(
    source_text_file: str,
    target_text_file: str,
    source_dict: pytorch_translate_dictionary.Dictionary,
    target_dict: pytorch_translate_dictionary.Dictionary,
    append_eos: Optional[bool] = False,
    reverse_source: Optional[bool] = True,
    char_source_dict: Optional[pytorch_translate_dictionary.Dictionary] = None,
) -> data.LanguagePairDataset:
    """Create a language-pair dataset directly from raw text files.

    Args:
        source_text_file: Path of the raw source-side text.
        target_text_file: Path of the raw target-side text.
        source_dict: Source-side word dictionary.
        target_dict: Target-side word dictionary.
        append_eos: Whether to append EOS to the *source* sentences.
        reverse_source: Whether to feed the source sentences reversed.
        char_source_dict: If given, the source side is parsed as a
            word+character dataset using this character dictionary.

    Returns:
        A LanguagePairDataset (or LanguagePairSourceCharDataset when a
        character dictionary is supplied).
    """
    # We always append EOS to the target sentence since we still want the
    # model to output an indication the sentence has finished, even if we
    # don't append the EOS symbol to the source sentence (to prevent the
    # model from misaligning UNKs or other words to the frequently
    # occurring EOS). The target order is never reversed: even when the
    # source is fed backwards, the model should start outputting from the
    # first word.
    target_dataset = indexed_dataset.IndexedRawTextDataset(
        path=target_text_file,
        dictionary=target_dict,
        append_eos=True,
        reverse_order=False,
    )
    if char_source_dict is None:
        source_dataset = indexed_dataset.IndexedRawTextDataset(
            path=source_text_file,
            dictionary=source_dict,
            append_eos=append_eos,
            reverse_order=reverse_source,
        )
        return data.LanguagePairDataset(
            src=source_dataset,
            dst=target_dataset,
            pad_idx=source_dict.pad(),
            eos_idx=source_dict.eos(),
        )
    source_dataset = char_data.InMemoryNumpyWordCharDataset()
    source_dataset.parse(
        path=source_text_file,
        word_dict=source_dict,
        char_dict=char_source_dict,
        reverse_order=reverse_source,
        append_eos=append_eos,
    )
    return char_data.LanguagePairSourceCharDataset(
        src=source_dataset,
        dst=target_dataset,
        pad_idx=source_dict.pad(),
        eos_idx=source_dict.eos(),
    )