Example no. 1
0
def generate(args):
    """Load vocabularies, an ensemble of models, and the raw-text test
    corpus, run generation on the requested subset, and return the score.

    Args:
        args: parsed command-line namespace. Must provide `path`,
            `source_vocab_file`, `target_vocab_file`, `source_text_file`,
            `target_text_file`, `gen_subset`, and `beam`; `source_lang` /
            `target_lang` are optional and default to 'src' / 'tgt'.

    Returns:
        The scorer's aggregate score for the generated subset.
    """
    assert_test_corpus_and_vocab_files_specified(args)
    assert args.path is not None, '--path required for generation!'

    print(args)

    # Fall back to generic language codes when none were supplied.
    if args.source_lang is None:
        args.source_lang = 'src'
    if args.target_lang is None:
        args.target_lang = 'tgt'

    src_dict = pytorch_translate_dictionary.Dictionary.load(
        args.source_vocab_file,
    )
    dst_dict = pytorch_translate_dictionary.Dictionary.load(
        args.target_vocab_file,
    )
    dataset = data.LanguageDatasets(
        src=args.source_lang,
        dst=args.target_lang,
        src_dict=src_dict,
        dst_dict=dst_dict,
    )
    models, model_args = utils.load_ensemble_for_inference(
        args.path,
        dataset.src_dict,
        dataset.dst_dict,
    )
    # Preprocess the raw text pair with the same source-side options the
    # models were trained with.
    dataset.splits[args.gen_subset] = pytorch_translate_data.make_language_pair_dataset(
        source_file=args.source_text_file,
        target_file=args.target_text_file,
        source_dict=src_dict,
        target_dict=dst_dict,
        append_eos=model_args.append_eos_to_source,
        reverse_source=model_args.reverse_source,
    )

    # NOTE(review): both languages were defaulted above, so this branch is
    # unreachable; kept for parity with sibling implementations of generate().
    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    # Bug fix: report the vocabulary *size* (len) rather than printing the
    # Dictionary object itself, matching the target-side line below.
    print(f'| [{dataset.src}] dictionary: {len(dataset.src_dict)} types')
    print(f'| [{dataset.dst}] dictionary: {len(dataset.dst_dict)} types')
    print(f'| {args.gen_subset} {len(dataset.splits[args.gen_subset])} examples')
    scorer, num_sentences, gen_timer = _generate_score(
        models=models,
        args=args,
        dataset=dataset,
        dataset_split=args.gen_subset,
    )
    print(f'| Translated {num_sentences} sentences ({gen_timer.n} tokens) '
          f'in {gen_timer.sum:.1f}s ({1. / gen_timer.avg:.2f} tokens/s)')
    print(f'| Generate {args.gen_subset} with beam={args.beam}: '
          f'{scorer.result_string()}')
    return scorer.score()
Example no. 2
0
def generate(args):
    """Run generation over the raw-text test corpus and return its score.

    Loads the source/target vocabularies, a (possibly heterogeneous)
    ensemble of models, and the evaluation split, then delegates scoring
    to ``_generate_score``.
    """
    assert_test_corpus_and_vocab_files_specified(args)
    assert args.path is not None, "--path required for generation!"

    print(args)

    # Substitute generic placeholder codes for any missing language names.
    for attr, fallback in (("source_lang", "src"), ("target_lang", "tgt")):
        if getattr(args, attr) is None:
            setattr(args, attr, fallback)

    load_vocab = pytorch_translate_dictionary.Dictionary.load
    source_dict = load_vocab(args.source_vocab_file)
    target_dict = load_vocab(args.target_vocab_file)
    dataset = data.LanguageDatasets(
        src=args.source_lang,
        dst=args.target_lang,
        src_dict=source_dict,
        dst_dict=target_dict,
    )
    models, model_args = load_diverse_ensemble_for_inference(
        args.path, dataset.src_dict, dataset.dst_dict)

    # Every ensemble member must agree on source-side preprocessing flags,
    # since the dataset is binarized once for all of them.
    append_eos_to_source = model_args[0].append_eos_to_source
    reverse_source = model_args[0].reverse_source
    for member_args in model_args:
        assert member_args.append_eos_to_source == append_eos_to_source
        assert member_args.reverse_source == reverse_source

    dataset.splits[args.gen_subset] = (
        pytorch_translate_data.make_language_pair_dataset(
            source_file=args.source_text_file,
            target_file=args.target_text_file,
            source_dict=source_dict,
            target_dict=target_dict,
            append_eos=append_eos_to_source,
            reverse_source=reverse_source,
        )
    )

    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print(f"| [{dataset.src}] dictionary: {len(dataset.src_dict)} types")
    print(f"| [{dataset.dst}] dictionary: {len(dataset.dst_dict)} types")
    print(f"| {args.gen_subset} {len(dataset.splits[args.gen_subset])} examples")

    scorer, num_sentences, gen_timer = _generate_score(
        models=models,
        args=args,
        dataset=dataset,
        dataset_split=args.gen_subset,
    )
    print(f"| Translated {num_sentences} sentences ({gen_timer.n} tokens) "
          f"in {gen_timer.sum:.1f}s ({1. / gen_timer.avg:.2f} tokens/s)")
    print(f"| Generate {args.gen_subset} with beam={args.beam}: "
          f"{scorer.result_string()}")
    return scorer.score()
Example no. 3
0
def generate(args):
    """Build the evaluation dataset from raw text, run generation on it,
    and return the resulting score."""
    assert_test_corpus_and_vocab_files_specified(args)
    assert args.path is not None, '--path required for generation!'

    print(args)

    # Default missing language names to generic placeholder codes.
    if args.source_lang is None:
        args.source_lang = 'src'
    if args.target_lang is None:
        args.target_lang = 'tgt'

    load_vocab = pytorch_translate_dictionary.Dictionary.load
    src_dict = load_vocab(args.source_vocab_file)
    dst_dict = load_vocab(args.target_vocab_file)
    dataset = data.LanguageDatasets(
        src=args.source_lang,
        dst=args.target_lang,
        src_dict=src_dict,
        dst_dict=dst_dict,
    )
    # Tokenize/binarize the raw text pair for the requested subset.
    dataset.splits[args.gen_subset] = pytorch_translate_data.make_language_pair_dataset(
        source_file=args.source_text_file,
        target_file=args.target_text_file,
        source_dict=src_dict,
        target_dict=dst_dict,
        args=args,
    )

    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print('| [{}] dictionary: {} types'.format(
        dataset.src, len(dataset.src_dict)))
    print('| [{}] dictionary: {} types'.format(
        dataset.dst, len(dataset.dst_dict)))
    print('| {} {} examples'.format(
        args.gen_subset, len(dataset.splits[args.gen_subset])))

    scorer, num_sentences, gen_timer = generate_score(
        args=args, dataset=dataset, dataset_split=args.gen_subset)
    print('| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} tokens/s)'
          .format(num_sentences, gen_timer.n, gen_timer.sum,
                  1. / gen_timer.avg))
    print('| Generate {} with beam={}: {}'.format(
        args.gen_subset, args.beam, scorer.result_string()))
    return scorer.score()