Ejemplo n.º 1
0
def binarize_text_file(
    text_file: str,
    dictionary: Dictionary,
    output_path: str,
    append_eos: bool,
    reverse_order: bool,
    use_char_data: bool = False,
    char_dictionary: Optional[Dictionary] = None,
    already_numberized: bool = False,
) -> str:
    """Parse a text file into an in-memory numpy dataset and save it to disk.

    Args:
        text_file: Path of the text file handed to the dataset's ``parse``.
        dictionary: Word dictionary used by the parser.
        output_path: Requested destination. It is first passed through
            ``maybe_generate_temp_file_path``; the value that helper returns
            is the path actually written to.
        append_eos: Forwarded unchanged to the parser.
        reverse_order: Forwarded unchanged to the parser.
        use_char_data: When true, a
            ``char_data.InMemoryNumpyWordCharDataset`` is built instead of a
            ``pytorch_translate_data.InMemoryNumpyDataset``.
        char_dictionary: Character dictionary; only forwarded when
            ``use_char_data`` is true.
        already_numberized: Only forwarded on the word-level path; the
            char-level parser is not given this flag.

    Returns:
        The path the dataset was saved to.
    """
    save_path = maybe_generate_temp_file_path(output_path)

    # Keyword arguments that both parser flavours receive.
    common_parse_args = {
        "path": text_file,
        "reverse_order": reverse_order,
        "append_eos": append_eos,
    }

    if not use_char_data:
        dataset = pytorch_translate_data.InMemoryNumpyDataset()
        dataset.parse(
            dictionary=dictionary,
            already_numberized=already_numberized,
            **common_parse_args,
        )
    else:
        dataset = char_data.InMemoryNumpyWordCharDataset()
        dataset.parse(
            word_dict=dictionary,
            char_dict=char_dictionary,
            **common_parse_args,
        )

    dataset.save(save_path)
    return save_path
Ejemplo n.º 2
0
def binarize_text_file_multilingual(
    corpus_configs: List[pytorch_translate_data.MultilingualCorpusConfig],
    output_path: str,
    append_eos: bool,
    reverse_order: bool,
    prepend_language_id: bool,
    use_char_data: bool = False,
    embed_bytes: bool = False,
    already_numberized: bool = False,
) -> str:
    """Parse several corpora into one in-memory numpy dataset and save it.

    Args:
        corpus_configs: Corpus descriptions passed as the single positional
            argument to the dataset's ``parse_multilingual``.
        output_path: Requested destination. It is first passed through
            ``maybe_generate_temp_file_path``; the value that helper returns
            is the path actually written to.
        append_eos: Forwarded unchanged to the parser.
        reverse_order: Forwarded unchanged to the parser.
        prepend_language_id: Forwarded unchanged to the parser.
        use_char_data: When true, a
            ``char_data.InMemoryNumpyWordCharDataset`` is built instead of a
            ``pytorch_translate_data.InMemoryNumpyDataset``.
        embed_bytes: Only forwarded when ``use_char_data`` is true.
        already_numberized: Forwarded unchanged to the parser.

    Returns:
        The path the dataset was saved to.
    """
    save_path = maybe_generate_temp_file_path(output_path)

    # Options every parser flavour understands.
    parse_options = {
        "append_eos": append_eos,
        "reverse_order": reverse_order,
        "prepend_language_id": prepend_language_id,
        "already_numberized": already_numberized,
    }

    if use_char_data:
        dataset = char_data.InMemoryNumpyWordCharDataset()
        # Only the word/char dataset is told about byte embeddings.
        parse_options["embed_bytes"] = embed_bytes
    else:
        dataset = pytorch_translate_data.InMemoryNumpyDataset()

    dataset.parse_multilingual(corpus_configs, **parse_options)
    dataset.save(save_path)
    return save_path
Ejemplo n.º 3
0
def binarize_text_file(
    text_file: str,
    dictionary: Dictionary,
    output_path: str,
    append_eos: bool,
    reverse_order: bool,
    use_char_data: bool = False,
    char_dictionary: Optional[Dictionary] = None,
) -> str:
    """Parse a text file into an in-memory numpy dataset and save it to disk.

    Args:
        text_file: Path of the text file handed to the dataset's ``parse``.
        dictionary: Word dictionary used by the parser.
        output_path: Destination path. If empty, a temporary file name is
            generated. A ``.npz`` suffix is appended when missing.
        append_eos: Forwarded unchanged to the parser.
        reverse_order: Forwarded unchanged to the parser.
        use_char_data: When true, a
            ``char_data.InMemoryNumpyWordCharDataset`` is built instead of a
            ``pytorch_translate_data.InMemoryNumpyDataset``.
        char_dictionary: Character dictionary; only forwarded when
            ``use_char_data`` is true.

    Returns:
        The path (always ending in ``.npz``) the dataset was saved to.
    """
    if not output_path:
        # Request the ".npz" suffix up front. mkstemp() actually creates the
        # file it names, so generating a suffix-less name and appending
        # ".npz" afterwards would leave an empty, orphaned temp file behind.
        fd, output_path = tempfile.mkstemp(suffix=".npz")
        # We only need the file name.
        os.close(fd)

    # numpy silently appends this suffix if it is not present, so this ensures
    # that the correct path is returned
    if not output_path.endswith(".npz"):
        output_path += ".npz"

    if use_char_data:
        dataset = char_data.InMemoryNumpyWordCharDataset()
        dataset.parse(
            path=text_file,
            word_dict=dictionary,
            char_dict=char_dictionary,
            reverse_order=reverse_order,
            append_eos=append_eos,
        )
    else:
        dataset = pytorch_translate_data.InMemoryNumpyDataset()
        # Kept positional: the parameter names of this parse() are not
        # visible here, so switching to keywords could break the call.
        dataset.parse(text_file, dictionary, reverse_order, append_eos)
    dataset.save(output_path)

    return output_path
    def load_dataset_from_text(
        self,
        split: str,
        source_text_file: str,
        target_text_file: str,
        append_eos: Optional[bool] = False,
        reverse_source: Optional[bool] = True,
    ):
        """Build a language-pair dataset from raw text and register it.

        The resulting dataset is stored in ``self.datasets[split]`` and a
        one-line summary with its length is printed.

        Args:
            split: Key under which the dataset is stored in ``self.datasets``.
            source_text_file: Path of the source-side text file.
            target_text_file: Path of the target-side text file.
            append_eos: Whether EOS is appended to *source* sentences.
            reverse_source: Whether *source* sentences are reversed.
        """
        # The target side always gets an EOS: the model must still emit an
        # end-of-sentence signal even when the source side goes without one
        # (source EOS is optional to keep the model from misaligning UNKs or
        # other words to the frequently occurring EOS).
        # The target side is never reversed: even if the source is fed in
        # backwards, output should still start from the first word.
        target_data = data.IndexedRawTextDataset(
            path=target_text_file,
            dictionary=self.target_dictionary,
            append_eos=True,
            reverse_order=False,
        )

        if self.char_source_dict is None:
            # Word-level source only.
            source_data = data.IndexedRawTextDataset(
                path=source_text_file,
                dictionary=self.source_dictionary,
                append_eos=append_eos,
                reverse_order=reverse_source,
            )
            self.datasets[split] = data.LanguagePairDataset(
                source_data,
                source_data.sizes,
                self.source_dictionary,
                target_data,
                target_data.sizes,
                self.target_dictionary,
                left_pad_source=False,
            )
        else:
            # A character dictionary is configured: parse the source into the
            # word + char dataset.
            source_data = char_data.InMemoryNumpyWordCharDataset()
            source_data.parse(
                path=source_text_file,
                word_dict=self.source_dictionary,
                char_dict=self.char_source_dict,
                reverse_order=reverse_source,
                append_eos=append_eos,
            )
            self.datasets[split] = char_data.LanguagePairSourceCharDataset(
                source_data,
                source_data.sizes,
                self.source_dictionary,
                target_data,
                target_data.sizes,
                self.target_dictionary,
            )

        print(f"| {split} {len(self.datasets[split])} examples")
Ejemplo n.º 5
0
def make_language_pair_dataset_from_text(
    source_text_file: str,
    target_text_file: str,
    source_dict: pytorch_translate_dictionary.Dictionary,
    target_dict: pytorch_translate_dictionary.Dictionary,
    append_eos: Optional[bool] = False,
    reverse_source: Optional[bool] = True,
    char_source_dict: Optional[pytorch_translate_dictionary.Dictionary] = None,
) -> data.LanguagePairDataset:
    """Create a language-pair dataset from a pair of raw text files.

    Args:
        source_text_file: Path of the source-side text file.
        target_text_file: Path of the target-side text file.
        source_dict: Dictionary for the source side; it also supplies the
            pad and EOS indices of the returned dataset.
        target_dict: Dictionary for the target side.
        append_eos: Whether EOS is appended to *source* sentences.
        reverse_source: Whether *source* sentences are reversed.
        char_source_dict: Optional character dictionary. When given, the
            source is parsed into a ``char_data.InMemoryNumpyWordCharDataset``
            and a ``char_data.LanguagePairSourceCharDataset`` is returned.

    Returns:
        A ``data.LanguagePairDataset`` when no character dictionary is
        supplied, otherwise a ``char_data.LanguagePairSourceCharDataset``.
    """
    # The target side always gets an EOS: the model must still emit an
    # end-of-sentence signal even when the source side goes without one
    # (source EOS is optional to keep the model from misaligning UNKs or
    # other words to the frequently occurring EOS).
    # The target side is never reversed: even if the source is fed in
    # backwards, output should still start from the first word.
    target_data = indexed_dataset.IndexedRawTextDataset(
        path=target_text_file,
        dictionary=target_dict,
        append_eos=True,
        reverse_order=False,
    )

    if char_source_dict is None:
        # Word-level source only.
        source_data = indexed_dataset.IndexedRawTextDataset(
            path=source_text_file,
            dictionary=source_dict,
            append_eos=append_eos,
            reverse_order=reverse_source,
        )
        return data.LanguagePairDataset(
            src=source_data,
            dst=target_data,
            pad_idx=source_dict.pad(),
            eos_idx=source_dict.eos(),
        )

    # A character dictionary was supplied: parse words and characters.
    source_data = char_data.InMemoryNumpyWordCharDataset()
    source_data.parse(
        path=source_text_file,
        word_dict=source_dict,
        char_dict=char_source_dict,
        reverse_order=reverse_source,
        append_eos=append_eos,
    )
    return char_data.LanguagePairSourceCharDataset(
        src=source_data,
        dst=target_data,
        pad_idx=source_dict.pad(),
        eos_idx=source_dict.eos(),
    )