def load_dataset_from_text(
        self,
        split: str,
        source_text_file: str,
        target_text_file: str,
        append_eos: Optional[bool] = False,
        reverse_source: Optional[bool] = True,
    ):
        """Build a source/target pair dataset from raw text files.

        The resulting dataset is stored in ``self.datasets[split]`` and a
        one-line summary with its length is printed.

        Args:
            split: Key under which the dataset is stored in ``self.datasets``.
            source_text_file: Path handed to the source-side dataset.
            target_text_file: Path handed to the target-side dataset.
            append_eos: Forwarded as ``append_eos`` to the source-side
                dataset only; the target side always uses ``True``.
            reverse_source: Forwarded as ``reverse_order`` to the source-side
                dataset only; the target side always uses ``False``.
        """
        # Target side: EOS is appended unconditionally so the model still
        # learns to signal the end of a sentence, even when EOS is left off
        # the source (which is done to keep the model from aligning UNKs or
        # other words to the very frequent EOS symbol). The target is never
        # reversed either: output should begin at the first word regardless
        # of the direction in which the source is fed.
        target_data = data.IndexedRawTextDataset(
            path=target_text_file,
            dictionary=self.target_dictionary,
            append_eos=True,
            reverse_order=False,
        )

        if self.char_source_dict is None:
            # Word-level source only.
            source_data = data.IndexedRawTextDataset(
                path=source_text_file,
                dictionary=self.source_dictionary,
                append_eos=append_eos,
                reverse_order=reverse_source,
            )
            pair_factory = data.LanguagePairDataset
            pair_options = {"left_pad_source": False}
        else:
            # A character dictionary is configured: parse the source with
            # both the word and the character dictionaries.
            source_data = char_data.InMemoryNumpyWordCharDataset()
            source_data.parse(
                path=source_text_file,
                word_dict=self.source_dictionary,
                char_dict=self.char_source_dict,
                reverse_order=reverse_source,
                append_eos=append_eos,
            )
            pair_factory = char_data.LanguagePairSourceCharDataset
            pair_options = {}

        # Both pair classes take the same six core arguments; only the
        # word-level one additionally receives left_pad_source=False.
        self.datasets[split] = pair_factory(
            src=source_data,
            src_sizes=source_data.sizes,
            src_dict=self.source_dictionary,
            tgt=target_data,
            tgt_sizes=target_data.sizes,
            tgt_dict=self.target_dictionary,
            **pair_options,
        )

        print(f"| {split} {len(self.datasets[split])} examples")
Esempio n. 2
0
 def load_multisource_dataset_from_text(
     self,
     split: str,
     source_text_files: List[str],
     target_text_file: str,
     append_eos: Optional[bool] = False,
     reverse_source: Optional[bool] = True,
 ):
     """Build a multi-source pair dataset from raw text files.

     The resulting dataset is stored in ``self.datasets[split]``.

     Args:
         split: Key under which the dataset is stored in ``self.datasets``.
         source_text_files: Paths handed, as one list, to the source-side
             dataset.
         target_text_file: Path handed to the target-side dataset.
         append_eos: Forwarded as ``append_eos`` to the source-side dataset
             only; the target side always uses ``True``.
         reverse_source: Forwarded as ``reverse_order`` to the source-side
             dataset only; the target side always uses ``False``.
     """
     multi_source_data = multisource_data.IndexedRawTextMultisentDataset(
         path=source_text_files,
         dictionary=self.source_dictionary,
         append_eos=append_eos,
         reverse_order=reverse_source,
     )

     # Target side: EOS is appended unconditionally so the model still
     # learns to signal the end of a sentence, even when EOS is left off
     # the source (which is done to keep the model from aligning UNKs or
     # other words to the very frequent EOS symbol). The target is never
     # reversed either: output should begin at the first word regardless
     # of the direction in which the source is fed.
     target_data = data.IndexedRawTextDataset(
         path=target_text_file,
         dictionary=self.target_dictionary,
         append_eos=True,
         reverse_order=False,
     )

     # Arguments are passed positionally, exactly as before:
     # source data, its sizes, source dictionary, then the target triple.
     self.datasets[split] = multisource_data.MultisourceLanguagePairDataset(
         multi_source_data,
         multi_source_data.sizes,
         self.source_dictionary,
         target_data,
         target_data.sizes,
         self.target_dictionary,
     )