Example #1
0
    def make_binary_dataset(input_prefix, output_prefix, lang, guess):
        """Binarize one side (`lang`) of a parallel corpus into an indexed dataset.

        Loads the per-language dictionary from `args.destdir`, tokenizes and
        binarizes `<input_prefix>.<lang>` (or `<input_prefix>.<lang>.guess`
        when `guess` is true), and writes `.bin`/`.idx` files named
        `<output_prefix>.<src>-<tgt>.<lang>[.guess]` under `args.destdir`.

        Args:
            input_prefix: path prefix of the plain-text input file.
            output_prefix: path prefix for the binarized output files.
            lang: language code; selects both dictionary and file extension.
            guess: if true, read/write the ".guess" variant of the files.
        """
        # NOTE: renamed from `dict`, which shadowed the builtin.
        dct = dictionary.Dictionary.load(
            os.path.join(args.destdir, 'dict.{}.txt'.format(lang)))
        # len - 1: presumably excludes a special symbol from the reported
        # type count — TODO confirm against Dictionary's symbol layout.
        print('| [{}] Dictionary: {} types'.format(lang, len(dct) - 1))

        # The guess variant differs only by an extra ".guess" path segment,
        # so build both file names from one template instead of duplicating
        # the builder construction in two branches.
        suffix = '.guess' if guess else ''
        ds = indexed_dataset.IndexedDatasetBuilder(
            '{}/{}.{}-{}.{}{}.bin'.format(args.destdir, output_prefix,
                                          args.source_lang, args.target_lang,
                                          lang, suffix))
        input_file = '{}.{}{}'.format(input_prefix, lang, suffix)

        def consumer(tensor):
            # Append each binarized sentence tensor to the dataset.
            ds.add_item(tensor)

        res = Tokenizer.binarize(input_file, dct, consumer)
        print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
            lang, input_file, res['nseq'], res['ntok'],
            100 * res['nunk'] / res['ntok'], dct.unk_word))
        ds.finalize('{}/{}.{}-{}.{}.idx'.format(args.destdir, output_prefix,
                                                args.source_lang,
                                                args.target_lang, lang))
Example #2
0
def binarize_text_file(
    text_file: str,
    dictionary: pytorch_translate_dictionary.Dictionary,
    output_prefix: str,
    append_eos: bool,
    reverse_order: bool,
) -> str:
    """Binarize `text_file` with `dictionary` into an indexed dataset.

    Writes the `.bin`/`.idx` pair derived from `output_prefix` (a fresh
    temporary prefix is created when none is given) and returns the prefix
    actually used.
    """
    if not output_prefix:
        # We only need a unique file name prefix, since the helper functions
        # take care of actually creating the file.
        fd, output_prefix = tempfile.mkstemp()
        os.close(fd)

    print(f"Outputting binarized version of {text_file} to "
          f"{indexed_dataset.data_file_path(output_prefix)} and "
          f"{indexed_dataset.index_file_path(output_prefix)}")

    data_path = indexed_dataset.data_file_path(output_prefix)
    builder = indexed_dataset.IndexedDatasetBuilder(data_path)

    def add_to_builder(tensor):
        # Each binarized sentence tensor is appended to the dataset.
        builder.add_item(tensor)

    counters = tokenizer.Tokenizer.binarize(
        filename=text_file,
        dict=dictionary,
        consumer=add_to_builder,
        append_eos=append_eos,
        reverse_order=reverse_order,
    )
    print(f"Binarizing {text_file}: {counters['nseq']} sents, "
          f"{counters['ntok']} tokens, "
          f"{100 * counters['nunk'] / counters['ntok']:.3}% replaced by "
          f"{dictionary.unk_word}.")

    index_path = indexed_dataset.index_file_path(output_prefix)
    builder.finalize(index_path)
    return output_prefix