Code example #1
    def load_rescore_models(self, args):
        """Load rescoring models."""
        models = {}
        # Left-to-right (forward) model
        if args.l2r_model_path:
            l2r_model, _, l2r_task = pytorch_translate_utils.load_diverse_ensemble_for_inference(
                [args.l2r_model_path]
            )
            models["l2r_model"] = {"model": l2r_model[0], "task": l2r_task}
        # Right-to-left model
        if args.r2l_model_path:
            r2l_model, _, r2l_task = pytorch_translate_utils.load_diverse_ensemble_for_inference(
                [args.r2l_model_path]
            )
            models["r2l_model"] = {"model": r2l_model[0], "task": r2l_task}
        # Reverse model
        if args.reverse_model_path:
            reverse_model, _, reverse_task = pytorch_translate_utils.load_diverse_ensemble_for_inference(
                [args.reverse_model_path]
            )
            models["reverse_model"] = {"model": reverse_model[0], "task": reverse_task}
        # Language model
        if args.lm_model_path:
            lm_model, _, lm_task = pytorch_translate_utils.load_diverse_ensemble_for_inference(
                [args.lm_model_path]
            )
            models["lm_model"] = {"model": lm_model[0], "task": lm_task}
        # Cloze transformer model
        if args.cloze_transformer_path:
            cloze_model, _, cloze_task = pytorch_translate_utils.load_diverse_ensemble_for_inference(
                [args.cloze_transformer_path]
            )
            models["cloze_model"] = {"model": cloze_model[0], "task": cloze_task}
        return models
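A minimal sketch of how this loader might be invoked, assuming an argparse-style namespace carrying the same *_model_path attributes read above; the checkpoint paths and the `rescorer` instance are placeholders, not part of the original snippet:

from argparse import Namespace

# Hypothetical arguments: only the attribute names read by load_rescore_models are assumed.
# Paths left as None simply skip the corresponding branch.
args = Namespace(
    l2r_model_path="checkpoints/l2r.pt",
    r2l_model_path="checkpoints/r2l.pt",
    reverse_model_path=None,
    lm_model_path=None,
    cloze_transformer_path=None,
)

models = rescorer.load_rescore_models(args)  # `rescorer`: a hypothetical instance of the enclosing class
for name, entry in models.items():
    print(name, type(entry["model"]).__name__)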
Code example #2
File: generate.py Project: flexpad/translate
def generate(args):
    pytorch_translate_options.print_args(args)

    # Setup task
    task = tasks.setup_task(args)

    models, model_args = pytorch_translate_utils.load_diverse_ensemble_for_inference(
        args.path.split(":"), task)
    args.source_lang = model_args[0].source_lang
    args.target_lang = model_args[0].target_lang

    append_eos_to_source = model_args[0].append_eos_to_source
    reverse_source = model_args[0].reverse_source
    assert all(a.append_eos_to_source == append_eos_to_source
               and a.reverse_source == reverse_source for a in model_args)
    if args.source_binary_file != "":
        assert args.target_binary_file != ""
        task.load_dataset(args.gen_subset, args.source_binary_file,
                          args.target_binary_file)
    elif pytorch_translate_data.is_multilingual(args):
        task.set_encoder_langs(model_args[0].multiling_encoder_lang)
        task.set_decoder_langs(model_args[0].multiling_decoder_lang)
        task.load_dataset_from_text_multilingual(
            args.gen_subset,
            source_text_file=args.source_text_file[0],
            target_text_file=args.target_text_file,
            source_lang_id=task.get_encoder_lang_id(
                args.multiling_source_lang[0]),
            target_lang_id=task.get_decoder_lang_id(
                args.multiling_target_lang[0]),
            append_eos=append_eos_to_source,
            reverse_source=reverse_source,
        )
    elif args.source_ensembling:
        task.load_multisource_dataset_from_text(
            args.gen_subset,
            source_text_files=args.source_text_file,
            target_text_file=args.target_text_file,
            append_eos=append_eos_to_source,
            reverse_source=reverse_source,
        )
    else:
        task.load_dataset_from_text(
            args.gen_subset,
            source_text_file=args.source_text_file[0],
            target_text_file=args.target_text_file,
            append_eos=append_eos_to_source,
            reverse_source=reverse_source,
        )

    scorer, num_sentences, gen_timer, _ = _generate_score(models=models,
                                                          args=args,
                                                          task=task,
                                                          dataset=task.dataset(
                                                              args.gen_subset))
    print(f"| Translated {num_sentences} sentences ({gen_timer.n} tokens) "
          f"in {gen_timer.sum:.1f}s ({1. / gen_timer.avg:.2f} tokens/s)")
    print(f"| Generate {args.gen_subset} with beam={args.beam}: "
          f"{scorer.result_string()}")
    return scorer.score()
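Note that `args.path` is split on ":" before being handed to `load_diverse_ensemble_for_inference`, so several checkpoints can be ensembled in a single call. A small sketch reusing the same call as above (the paths are placeholders):

# Two checkpoints ensembled for inference; the ":" delimiter matches the split above.
args.path = "checkpoints/model_a.pt:checkpoints/model_b.pt"
models, model_args = pytorch_translate_utils.load_diverse_ensemble_for_inference(
    args.path.split(":"), task
)
assert len(models) == 2 and len(model_args) == 2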
Code example #3
    def __init__(self, args, task):
        """
        This code is for word-level knowledge distillation. Most of the algorithm
        is inspired by the Kim and Rush (2016) paper:
        http://www.aclweb.org/anthology/D16-1139
        """
        super().__init__(args, task)
        assert (args.teacher_path
                ), "Please specify at least one valid file for --teacher-path"
        use_cuda = torch.cuda.is_available() and not self.args.cpu

        # Load model ensemble from checkpoints
        self.teacher_models, self.teacher_model_args, _ = pytorch_translate_utils.load_diverse_ensemble_for_inference(
            args.teacher_path.split(":"), task)

        # Move models to device and to evaluation mode
        if use_cuda:
            for model in self.teacher_models:
                model.cuda()
        for model in self.teacher_models:
            model.make_generation_fast_(
                beamable_mm_beam_size=None if args.no_beamable_mm else args.beam
            )

        self.kd_weight = getattr(args, "kd_weight", 0)
        if self.kd_weight < 0 or self.kd_weight > 1:
            raise ValueError(
                f"--kd-weight ({self.kd_weight}) must be in [0, 1]")

        self.top_k_teacher_tokens = getattr(args, "top_k_teacher_tokens", 8)
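The docstring points to word-level knowledge distillation (Kim and Rush, 2016). For orientation only, here is a minimal, self-contained PyTorch sketch of how `kd_weight` and a teacher's top-k outputs are typically combined into a training loss; it illustrates the general technique, not the criterion implemented in this file, and all tensor names are assumptions:

import torch.nn.functional as F

def word_kd_loss(student_logits, gold_tokens, top_k_scores, top_k_indices, kd_weight):
    """Word-level KD sketch: mix teacher-distribution cross-entropy with gold NLL.

    student_logits: (num_tokens, vocab) unnormalized student scores.
    gold_tokens:    (num_tokens,) reference token ids.
    top_k_scores:   (num_tokens, k) teacher probabilities for its top-k tokens.
    top_k_indices:  (num_tokens, k) vocabulary ids of those top-k tokens.
    """
    lprobs = F.log_softmax(student_logits, dim=-1)

    # Distillation term: cross-entropy against the teacher's truncated (top-k) distribution.
    student_topk_lprobs = lprobs.gather(dim=-1, index=top_k_indices)
    kd_loss = -(top_k_scores * student_topk_lprobs).sum()

    # Standard negative log-likelihood against the gold reference tokens.
    nll_loss = F.nll_loss(lprobs, gold_tokens, reduction="sum")

    return kd_weight * kd_loss + (1.0 - kd_weight) * nll_loss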
Code example #4
    def __init__(self, args, task):
        super().__init__(args, task)
        assert args.teacher_path, (
            'Please specify at least one valid file for --teacher-path')
        use_cuda = torch.cuda.is_available() and not self.args.cpu

        # Load model ensemble from checkpoints
        self.teacher_models, self.teacher_model_args = (
            pytorch_translate_utils.load_diverse_ensemble_for_inference(
                [args.teacher_path],
                task,
            ))

        # Move models to device and to evaluation mode
        if use_cuda:
            for model in self.teacher_models:
                model.cuda()
        for model in self.teacher_models:
            model.make_generation_fast_(
                beamable_mm_beam_size=None if args.no_beamable_mm else args.beam
            )

        self.kd_weight = getattr(args, 'kd_weight', 0)
        if self.kd_weight < 0 or self.kd_weight > 1:
            raise ValueError(
                f'--kd-weight ({self.kd_weight}) must be in [0, 1]')
Code example #5
def save_top_k(args):
    """
    This function runs forward computation on an ensemble of trained models
    using binarized parallel training data and saves the top-k probabilities
    and their corresponding token indices for each output step.

    Note that the Python binary accepts all generation params, but ignores
    inapplicable ones (such as those related to output length). --max-tokens
    is of particular importance to prevent memory errors.
    """
    pytorch_translate_options.print_args(args)
    use_cuda = torch.cuda.is_available() and not getattr(args, "cpu", False)

    (
        models,
        model_args,
        task,
    ) = pytorch_translate_utils.load_diverse_ensemble_for_inference(
        args.path.split(CHECKPOINT_PATHS_DELIMITER)
    )
    for model in models:
        model.eval()
        if use_cuda:
            model.cuda()

    append_eos_to_source = model_args[0].append_eos_to_source
    reverse_source = model_args[0].reverse_source
    assert all(
        a.append_eos_to_source == append_eos_to_source
        and a.reverse_source == reverse_source
        for a in model_args
    )
    assert (
        args.source_binary_file != "" and args.target_binary_file != ""
    ), "collect_top_k_probs requires binarized data."
    task.load_dataset(args.gen_subset, args.source_binary_file, args.target_binary_file)

    assert (
        args.top_k_probs_binary_file != ""
    ), "must specify output file (--top-k-probs-binary-file)!"
    output_path = args.top_k_probs_binary_file

    dataset = task.dataset(args.gen_subset)

    top_k_scores, top_k_indices = compute_top_k(
        task=task,
        models=models,
        dataset=dataset,
        k=args.k_probs_to_collect,
        use_cuda=use_cuda,
        max_tokens=args.teacher_max_tokens,
        max_sentences=args.max_sentences,
        progress_bar_args=args,
    )

    np.savez(output_path, top_k_scores=top_k_scores, top_k_indices=top_k_indices)
    print(
        f"Saved top {top_k_scores.shape[1]} probs for a total of "
        f"{top_k_scores.shape[0]} tokens to file {output_path}"
    )
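Because the probabilities are written with `np.savez`, they can be read back with `np.load`. A short sketch (the file name is a placeholder; NumPy appends a ".npz" suffix when the given path lacks one):

import numpy as np

probs = np.load("top_k_probs.npz")       # placeholder path
top_k_scores = probs["top_k_scores"]     # (num_target_tokens, k) teacher probabilities
top_k_indices = probs["top_k_indices"]   # (num_target_tokens, k) vocabulary indices
print(top_k_scores.shape, top_k_indices.shape)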
Code example #6
    def __init__(self, args, model_path=None, model=None, forward_task=None):
        """ Initialize a rescorer model

        Args:
          args: model arguments
          model_path: checkpoint path for rescoring model
        """
        # TODO (T40938917): Allow loading of multiple rescoring models
        # allow creating an empty scorer without a model
        self.args = args
        self.forward_task = forward_task
        self.task = None
        self.model = None
        # Instantiate the model
        if model is not None:
            self.model = model["model"]
            self.task = model["task"]
        elif model_path:
            rescoring_model, _, task = utils.load_diverse_ensemble_for_inference(
                [model_path])
            self.model = rescoring_model[0]
            self.task = task

        if self.model is not None:
            self.model.eval()
            # Turn off gradient computation in eval mode
            for param in self.model.parameters():
                param.requires_grad = False
            utils.maybe_cuda(self.model)
Code example #7
    def __init__(self, args, src_dict, tgt_dict, char_source_dict=None):
        super().__init__(
            args,
            src_dict=src_dict,
            tgt_dict=tgt_dict,
            char_source_dict=char_source_dict,
        )
        self.top_k_probs_binary_file = args.top_k_probs_binary_file
        self.top_k_teacher_tokens = args.top_k_teacher_tokens

        if self.top_k_probs_binary_file is None:
            # Load model ensemble from checkpoints
            (
                self.teacher_models,
                _,
                _,
            ) = pytorch_translate_utils.load_diverse_ensemble_for_inference(
                args.teacher_path.split(":"))
            if torch.cuda.is_available():
                for teacher_model in self.teacher_models:
                    teacher_model = pytorch_translate_utils.maybe_cuda(
                        teacher_model)
        else:
            self.teacher_models = None

        # Memoized scores for teacher models. By gradually memoizing these values,
        # we avoid repeatedly recomputing the teacher scores.
        self.top_k_teacher_scores: Dict[int, np.ndarray] = {}
        self.top_k_teacher_indices: Dict[int, np.ndarray] = {}
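The memoization comment above amounts to a get-or-compute cache keyed by an integer id. A schematic sketch of that pattern, where the key choice and `compute_fn` are assumptions rather than this class's actual logic:

from typing import Callable, Dict, Tuple
import numpy as np

def get_or_compute_teacher_top_k(
    sample_id: int,
    scores_cache: Dict[int, np.ndarray],
    indices_cache: Dict[int, np.ndarray],
    compute_fn: Callable[[int], Tuple[np.ndarray, np.ndarray]],
) -> Tuple[np.ndarray, np.ndarray]:
    """Return cached teacher top-k scores/indices, computing them at most once per id."""
    if sample_id not in scores_cache:
        scores_cache[sample_id], indices_cache[sample_id] = compute_fn(sample_id)
    return scores_cache[sample_id], indices_cache[sample_id]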
Code example #8
File: rescoring.py Project: yazici/translate
def setup_rescoring(args):
    if args.rescoring_strategy is None or args.rescoring_model_path is None:
        return None

    # TODO (T40938917): Allow loading of multiple rescoring models
    rescoring_model, rescoring_model_arg, rescoring_task = pytorch_translate_utils.load_diverse_ensemble_for_inference(
        [args.rescoring_model_path])
    return rescoring_model[0]
Code example #9
File: rescorer.py Project: Pandinosaurus/translate
def main():
    parser = argparse.ArgumentParser(
        description=("Rescore generated hypotheses with extra models"))
    add_args(parser)
    add_args_rescore(parser)
    args = parser.parse_args()

    assert (args.translation_info_export_path is not None
            ), "--translation_info_export_path is required for rescoring"

    assert args.l2r_model_path is not None, "Rescoring needs forward model"

    _, _, forward_task = utils.load_diverse_ensemble_for_inference(
        [args.l2r_model_path])
    rescorer = Rescorer(args, forward_task)
    dst_dict = forward_task.tgt_dict
    base_bleu_scorer = bleu.Scorer(dst_dict.pad(), dst_dict.eos(),
                                   dst_dict.unk())
    rescoring_bleu_scorer = bleu.Scorer(
        bleu.BleuConfig(
            pad=dst_dict.pad(),
            eos=dst_dict.eos(),
            unk=dst_dict.unk(),
        ))

    with open(args.translation_info_export_path, "rb") as file:
        translation_info_list = pickle.load(file)

    scores_to_export_list = []
    trans_batch_info = []
    for k in tqdm(range(0, len(translation_info_list), args.batch_size)):
        trans_batch_info = translation_info_list[k:k + args.batch_size]
        for j in range(len(trans_batch_info)):
            trans_batch_info[j]["hypos"] = [{
                "score": hypo["score"],
                "tokens": hypo["tokens"].cuda()
            } for hypo in trans_batch_info[j]["hypos"]]
        top_tokens, scores_to_export = find_top_tokens(args, trans_batch_info,
                                                       rescorer,
                                                       dst_dict.pad())
        if args.scores_info_export_path is not None:
            scores_to_export_list += scores_to_export

        for i, trans_info in enumerate(trans_batch_info):
            base_bleu_scorer.add(
                trans_info["target_tokens"].int().cpu(),
                trans_info["hypos"][0]["tokens"].int().cpu(),
            )
            rescoring_bleu_scorer.add(trans_info["target_tokens"].int().cpu(),
                                      top_tokens[i].int().cpu())
        trans_batch_info = []

    print("| Base ", base_bleu_scorer.result_string())
    print("| Rescoring ", rescoring_bleu_scorer.result_string())

    if args.scores_info_export_path is not None:
        with open(args.scores_info_export_path, "wb") as file:
            pickle.dump(scores_to_export_list, file)
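Judging from the fields accessed in the loop, each entry of the pickled translation info is expected to hold target_tokens plus a list of hypos carrying score and tokens. A hedged sketch of writing a file in that assumed layout, e.g. for a quick local test (all values and the path are made up):

import pickle
import torch

translation_info_list = [
    {
        "target_tokens": torch.tensor([11, 12, 13, 2]),
        "hypos": [
            {"score": -1.23, "tokens": torch.tensor([11, 12, 14, 2])},
            {"score": -2.05, "tokens": torch.tensor([11, 15, 2])},
        ],
    },
]

with open("translation_info.pkl", "wb") as f:  # placeholder path
    pickle.dump(translation_info_list, f)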
Code example #10
File: model_scorers.py Project: jinfengr/translate
    def __init__(self, args, model_path):
        self.args = args
        # TODO (T40938917): Allow loading of multiple rescoring models
        (
            rescoring_model,
            rescoring_model_arg,
            rescoring_task,
        ) = utils.load_diverse_ensemble_for_inference([model_path])
        self.task = rescoring_task
        self.model = rescoring_model[0]
        self.model.eval()

        if not self.args.cpu:
            utils.maybe_cuda(self.model)
Code example #11
def generate(args):
    assert_test_corpus_and_vocab_files_specified(args)
    assert args.path is not None, "--path required for generation!"

    print(args)

    src_dict = pytorch_translate_dictionary.Dictionary.load(args.source_vocab_file)
    dst_dict = pytorch_translate_dictionary.Dictionary.load(args.target_vocab_file)
    dataset = data.LanguageDatasets(
        src=args.source_lang, dst=args.target_lang, src_dict=src_dict, dst_dict=dst_dict
    )
    models, model_args = pytorch_translate_utils.load_diverse_ensemble_for_inference(
        args.path, dataset.src_dict, dataset.dst_dict
    )
    append_eos_to_source = model_args[0].append_eos_to_source
    reverse_source = model_args[0].reverse_source
    assert all(
        a.append_eos_to_source == append_eos_to_source
        and a.reverse_source == reverse_source
        for a in model_args
    )
    dataset.splits[args.gen_subset] = pytorch_translate_data.make_language_pair_dataset_from_text(
        source_text_file=args.source_text_file,
        target_text_file=args.target_text_file,
        source_dict=src_dict,
        target_dict=dst_dict,
        append_eos=append_eos_to_source,
        reverse_source=reverse_source,
    )

    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print(f"| [{dataset.src}] dictionary: {len(dataset.src_dict)} types")
    print(f"| [{dataset.dst}] dictionary: {len(dataset.dst_dict)} types")
    print(f"| {args.gen_subset} {len(dataset.splits[args.gen_subset])} examples")
    scorer, num_sentences, gen_timer = _generate_score(
        models=models, args=args, dataset=dataset, dataset_split=args.gen_subset
    )
    print(
        f"| Translated {num_sentences} sentences ({gen_timer.n} tokens) "
        f"in {gen_timer.sum:.1f}s ({1. / gen_timer.avg:.2f} tokens/s)"
    )
    print(
        f"| Generate {args.gen_subset} with beam={args.beam}: "
        f"{scorer.result_string()}"
    )
    return scorer.score()
Code example #12
File: rescorer.py Project: blufb/translate
def main():
    args = get_arg_parser().parse_args()

    assert (args.translation_info_export_path is not None
            ), "--translation_info_export_path is required for rescoring"

    assert args.l2r_model_path is not None, "Rescoring needs forward model"

    _, _, forward_task = utils.load_diverse_ensemble_for_inference(
        [args.l2r_model_path])
    rescorer = Rescorer(args, forward_task)
    dst_dict = forward_task.tgt_dict
    base_bleu_scorer = bleu.Scorer(dst_dict.pad(), dst_dict.eos(),
                                   dst_dict.unk())
    rescoring_bleu_scorer = bleu.Scorer(dst_dict.pad(), dst_dict.eos(),
                                        dst_dict.unk())

    with open(args.translation_info_export_path, "rb") as f:
        translation_info_list = pickle.load(f)
    scores_to_export_list = []
    for trans_info in tqdm(translation_info_list):
        trans_info["hypos"] = [{
            "score": hypo["score"],
            "tokens": hypo["tokens"].cuda()
        } for hypo in trans_info["hypos"]]

        base_bleu_scorer.add(
            trans_info["target_tokens"].int().cpu(),
            trans_info["hypos"][0]["tokens"].int().cpu(),
        )

        top_tokens, scores_to_export = find_top_tokens(args, trans_info,
                                                       rescorer)
        if args.scores_info_export_path is not None:
            scores_to_export_list.append(scores_to_export)

        rescoring_bleu_scorer.add(trans_info["target_tokens"].int().cpu(),
                                  top_tokens.int().cpu())

    print("| Base ", base_bleu_scorer.result_string())
    print("| Rescoring ", rescoring_bleu_scorer.result_string())

    if args.scores_info_export_path is not None:
        with open(args.scores_info_export_path, "wb") as f:
            pickle.dump(scores_to_export_list, f)
Code example #13
    def __init__(self, args, model_path):
        """ Initialize a rescorer model

        Args:
          args: model arguments
          model_path: checkpoint path for rescoring model
        """
        self.args = args
        # TODO (T40938917): Allow loading of multiple rescoring models
        (
            rescoring_model,
            rescoring_model_arg,
            rescoring_task,
        ) = utils.load_diverse_ensemble_for_inference([model_path])
        self.task = rescoring_task  # e.g. p(y), p(x|y), etc.
        self.model = rescoring_model[0]
        self.model.eval()

        utils.maybe_cuda(self.model)
Code example #14
File: benchmark.py Project: YangHaha11514/translate
def generate(args):
    assert_test_corpus_and_vocab_files_specified(args)
    assert args.path is not None, "--path required for generation!"

    print(args)

    # Benchmarking should be language-agnostic
    args.source_lang = "src"
    args.target_lang = "tgt"

    src_dict = pytorch_translate_dictionary.Dictionary.load(
        args.source_vocab_file)
    dst_dict = pytorch_translate_dictionary.Dictionary.load(
        args.target_vocab_file)

    # Generate synthetic raw text files
    source_text_file = generate_synthetic_text(args.source_lang,
                                               src_dict.symbols, args)
    target_text_file = generate_synthetic_text(args.target_lang,
                                               dst_dict.symbols, args)

    dataset = data.LanguageDatasets(src=args.source_lang,
                                    dst=args.target_lang,
                                    src_dict=src_dict,
                                    dst_dict=dst_dict)
    models, model_args = pytorch_translate_utils.load_diverse_ensemble_for_inference(
        args.path, dataset.src_dict, dataset.dst_dict)
    append_eos_to_source = model_args[0].append_eos_to_source
    reverse_source = model_args[0].reverse_source
    assert all(a.append_eos_to_source == append_eos_to_source
               and a.reverse_source == reverse_source for a in model_args)
    dataset.splits[args.gen_subset] = pytorch_translate_data.make_language_pair_dataset_from_text(
            source_text_file=source_text_file,
            target_text_file=target_text_file,
            source_dict=src_dict,
            target_dict=dst_dict,
            append_eos=append_eos_to_source,
            reverse_source=reverse_source,
        )

    # Remove temporary text files
    os.remove(source_text_file)
    os.remove(target_text_file)

    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print(f"| [{dataset.src}] dictionary: {len(dataset.src_dict)} types")
    print(f"| [{dataset.dst}] dictionary: {len(dataset.dst_dict)} types")
    print(
        f"| {args.gen_subset} {len(dataset.splits[args.gen_subset])} examples")
    args.keep_detailed_timing = True
    scorer, num_sentences, gen_timer = pytorch_translate_generate._generate_score(
        models=models,
        args=args,
        dataset=dataset,
        dataset_split=args.gen_subset)

    # Remove contribution of primer sentence
    gen_timer.reset_bucket(0)

    print(
        f"| Translated {num_sentences} sentences ({sum(gen_timer.n)} tokens) "
        f"in {sum(gen_timer.sum):.3f}s ({1. / gen_timer.avg:.2f} tokens/s)")

    for bucket_id in range(gen_timer.n_buckets):
        if gen_timer.n[bucket_id] != 0:
            print(
                "  | Length {}: {} sentences ({} tok) in {:.3f}s ({:.3f} tok/s, avg. latency {:4f}s)"
                .format(
                    bucket_id * args.increment,
                    gen_timer.count[bucket_id],
                    gen_timer.n[bucket_id],
                    gen_timer.sum[bucket_id],
                    1. / gen_timer.avgs[bucket_id],
                    gen_timer.sum[bucket_id] / gen_timer.count[bucket_id],
                ))

    print(f"| Generate {args.gen_subset} with beam={args.beam}: "
          f"{scorer.result_string()}")
    return scorer.score()
Code example #15
File: whitebox.py Project: yashpatel5400/translate
def setup_attack(args):
    """Load model, data and create the AdversarialTrainer object"""

    # Setup task
    task = tasks.setup_task(args)

    # Load model
    models, models_args = pytorch_translate_utils.load_diverse_ensemble_for_inference(
        args.path.split(':'),
        task,
    )

    # Only one model is supported as of now
    model, model_args = models[0], models_args[0]

    # Languages
    args.source_lang = model_args.source_lang
    args.target_lang = model_args.target_lang

    # Keep track of whether we reverse the source or not
    # (this is important to save the adversarial inputs in the correct order)
    args.reverse_source = model_args.reverse_source

    # Load dataset
    task.load_dataset_from_text(
        args.gen_subset,
        source_text_file=args.source_text_file,
        target_text_file=args.target_text_file,
        append_eos=model_args.append_eos_to_source,
        reverse_source=model_args.reverse_source,
    )

    # Create adversarial criterion
    adv_criterion = task.build_adversarial_criterion(args)

    # Adversary
    adversary = adversaries.build_adversary(args, model, task)

    # Print a bit of info
    print(f"| model {model_args.arch}, "
          f"adversarial criterion {adv_criterion.__class__.__name__}, "
          f"adversary {adversary.__class__.__name__}")

    # Build trainer
    adv_trainer = adversarial_trainer.AdversarialTrainer(
        args=args,
        task=task,
        model=model,
        criterion=None,
        adversarial_criterion=adv_criterion,
        adversary=adversary)

    # Device infos
    # For now only 1 GPU is supported
    distributed_world_size = getattr(args, "distributed_world_size", 1)
    print(f"| Attacking on {distributed_world_size} GPU(s)")
    print(
        f"| max tokens per GPU = {args.max_tokens} and \
        max sentences per GPU = {args.max_sentences}",
        flush=True,
    )

    return adv_trainer, task
Code example #16
def generate(args):
    pytorch_translate_options.print_args(args)

    src_dict = pytorch_translate_dictionary.Dictionary.load(args.source_vocab_file)
    dst_dict = pytorch_translate_dictionary.Dictionary.load(args.target_vocab_file)
    use_char_source = args.char_source_vocab_file != ""
    if use_char_source:
        char_source_dict = pytorch_translate_dictionary.Dictionary.load(
            args.char_source_vocab_file
        )
        # this attribute is used for CharSourceModel construction
        args.char_source_dict_size = len(char_source_dict)
    else:
        char_source_dict = None

    dataset = data.LanguageDatasets(
        src=args.source_lang, dst=args.target_lang, src_dict=src_dict, dst_dict=dst_dict
    )
    models, model_args = pytorch_translate_utils.load_diverse_ensemble_for_inference(
        args.path, dataset.src_dict, dataset.dst_dict
    )
    append_eos_to_source = model_args[0].append_eos_to_source
    reverse_source = model_args[0].reverse_source
    assert all(
        a.append_eos_to_source == append_eos_to_source
        and a.reverse_source == reverse_source
        for a in model_args
    )
    if args.source_binary_file != "":
        assert args.target_binary_file != ""
        dst_dataset = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
            args.target_binary_file
        )
        if use_char_source:
            src_dataset = char_data.InMemoryNumpyWordCharDataset.create_from_file(
                args.source_binary_file
            )
            gen_split = char_data.LanguagePairSourceCharDataset(
                src=src_dataset,
                dst=dst_dataset,
                pad_idx=src_dict.pad(),
                eos_idx=dst_dict.eos(),
            )
        else:
            src_dataset = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
                args.source_binary_file
            )
            gen_split = data.LanguagePairDataset(
                src=src_dataset,
                dst=dst_dataset,
                pad_idx=src_dict.pad(),
                eos_idx=dst_dict.eos(),
            )
    elif pytorch_translate_data.is_multilingual(args):
        gen_split = pytorch_translate_data.make_language_pair_dataset_from_text_multilingual(
            source_text_file=args.source_text_file[0],
            target_text_file=args.target_text_file,
            source_lang_id=args.multiling_source_lang_id,
            target_lang_id=args.multiling_target_lang_id,
            source_dict=src_dict,
            target_dict=dst_dict,
            append_eos=append_eos_to_source,
            reverse_source=reverse_source,
        )
    elif args.source_ensembling:
        gen_split = multisource_data.make_multisource_language_pair_dataset_from_text(
            source_text_files=args.source_text_file,
            target_text_file=args.target_text_file,
            source_dict=src_dict,
            target_dict=dst_dict,
            append_eos=append_eos_to_source,
            reverse_source=reverse_source,
        )
    else:
        gen_split = pytorch_translate_data.make_language_pair_dataset_from_text(
            source_text_file=args.source_text_file[0],
            target_text_file=args.target_text_file,
            source_dict=src_dict,
            target_dict=dst_dict,
            append_eos=append_eos_to_source,
            reverse_source=reverse_source,
            char_source_dict=char_source_dict,
        )
    dataset.splits[args.gen_subset] = gen_split

    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print(f"| [{dataset.src}] dictionary: {len(dataset.src_dict)} types")
    print(f"| [{dataset.dst}] dictionary: {len(dataset.dst_dict)} types")
    print(f"| {args.gen_subset} {len(dataset.splits[args.gen_subset])} examples")
    scorer, num_sentences, gen_timer, _ = _generate_score(
        models=models, args=args, dataset=dataset, dataset_split=args.gen_subset
    )
    print(
        f"| Translated {num_sentences} sentences ({gen_timer.n} tokens) "
        f"in {gen_timer.sum:.1f}s ({1. / gen_timer.avg:.2f} tokens/s)"
    )
    print(
        f"| Generate {args.gen_subset} with beam={args.beam}: "
        f"{scorer.result_string()}"
    )
    return scorer.score()
Code example #17
def generate(args):
    pytorch_translate_options.print_args(args)

    models, model_args, task = pytorch_translate_utils.load_diverse_ensemble_for_inference(
        args.path.split(CHECKPOINT_PATHS_DELIMITER)
    )
    args.source_lang = model_args[0].source_lang
    args.target_lang = model_args[0].target_lang

    append_eos_to_source = model_args[0].append_eos_to_source
    reverse_source = model_args[0].reverse_source
    assert all(
        a.append_eos_to_source == append_eos_to_source
        and a.reverse_source == reverse_source
        for a in model_args
    )
    if args.source_binary_file != "":
        assert args.target_binary_file != ""
        if isinstance(task, PytorchTranslateTask):
            task.load_dataset(
                args.gen_subset,
                args.source_binary_file,
                args.target_binary_file,
                is_npz=args.is_npz,
            )
        else:
            task.load_dataset(
                args.gen_subset, args.source_binary_file, args.target_binary_file
            )
    elif pytorch_translate_data.is_multilingual_many_to_one(args):
        task.set_encoder_langs(model_args[0].multiling_encoder_lang)
        task.set_decoder_langs(model_args[0].multiling_decoder_lang)
        task.load_dataset_from_text_multilingual(
            args.gen_subset,
            source_text_file=args.source_text_file[0],
            target_text_file=args.target_text_file,
            source_lang_id=task.get_encoder_lang_id(args.multiling_source_lang[0]),
            target_lang_id=task.get_decoder_lang_id(args.multiling_target_lang[0]),
            append_eos=append_eos_to_source,
            reverse_source=reverse_source,
        )
    elif args.source_ensembling:
        task.load_multisource_dataset_from_text(
            args.gen_subset,
            source_text_files=args.source_text_file,
            target_text_file=args.target_text_file,
            append_eos=append_eos_to_source,
            reverse_source=reverse_source,
        )
    else:
        task.load_dataset_from_text(
            args.gen_subset,
            source_text_file=args.source_text_file[0],
            target_text_file=args.target_text_file,
            append_eos=append_eos_to_source,
            reverse_source=reverse_source,
        )

    lang_pair = None
    if isinstance(task, PyTorchTranslateMultiTask):
        if args.source_lang and args.target_lang:
            lang_pair = args.source_lang + "-" + args.target_lang
        else:
            lang_pair = "src-tgt"
    scorer, num_sentences, gen_timer, _ = generate_score(
        args=args,
        task=task,
        dataset=task.dataset(args.gen_subset),
        lang_pair=lang_pair,
        models=models,
    )
    print(
        f"| Translated {num_sentences} sentences ({gen_timer.n} tokens) "
        f"in {gen_timer.sum:.1f}s ({1. / gen_timer.avg:.2f} tokens/s)"
    )
    print(
        f"| Generate {args.gen_subset} with beam={args.beam}: "
        f"{scorer.result_string()}"
    )
    return scorer.score()
Code example #18
def generate(args):
    assert_test_corpus_and_vocab_files_specified(args)
    assert args.path is not None, "--path required for generation!"

    print(args)

    # Benchmarking should be language-agnostic
    args.source_lang = "src"
    args.target_lang = "tgt"

    task = tasks.setup_task(args)
    models, model_args = pytorch_translate_utils.load_diverse_ensemble_for_inference(
        args.path.split(':'), task)

    # Generate synthetic raw text files
    source_text_file = generate_synthetic_text(args.source_lang,
                                               task.source_dictionary.symbols,
                                               args)
    target_text_file = generate_synthetic_text(args.target_lang,
                                               task.target_dictionary.symbols,
                                               args)

    append_eos_to_source = model_args[0].append_eos_to_source
    reverse_source = model_args[0].reverse_source
    assert all(a.append_eos_to_source == append_eos_to_source
               and a.reverse_source == reverse_source for a in model_args)

    task.load_dataset_from_text(
        args.gen_subset,
        source_text_file=source_text_file,
        target_text_file=target_text_file,
        append_eos=append_eos_to_source,
        reverse_source=reverse_source,
    )

    # Remove temporary text files
    os.remove(source_text_file)
    os.remove(target_text_file)

    args.keep_detailed_timing = True
    scorer, num_sentences, gen_timer, _ = pytorch_translate_generate._generate_score(
        models=models, args=args, task=task, dataset_split=args.gen_subset)

    # Remove contribution of primer sentence
    gen_timer.reset_bucket(0)

    print(
        f"| Translated {num_sentences} sentences ({sum(gen_timer.n)} tokens) "
        f"in {sum(gen_timer.sum):.3f}s ({1. / gen_timer.avg:.2f} tokens/s)")

    for bucket_id in range(gen_timer.n_buckets):
        if gen_timer.n[bucket_id] != 0:
            print(
                "  | Length {}: {} sentences ({} tok) in {:.3f}s ({:.3f} tok/s, avg. latency {:4f}s)"
                .format(
                    bucket_id * args.increment,
                    gen_timer.count[bucket_id],
                    gen_timer.n[bucket_id],
                    gen_timer.sum[bucket_id],
                    1. / gen_timer.avgs[bucket_id],
                    gen_timer.sum[bucket_id] / gen_timer.count[bucket_id],
                ))

    print(f"| Generate {args.gen_subset} with beam={args.beam}: "
          f"{scorer.result_string()}")
    return scorer.score()
Code example #19
File: benchmark.py Project: jamesr66a/translate-1
def benchmark(args):
    assert args.source_vocab_file and os.path.isfile(
        args.source_vocab_file
    ), "Please specify a valid file for --source-vocab-file"
    assert args.target_vocab_file and os.path.isfile(
        args.target_vocab_file
    ), "Please specify a valid file for --target-vocab_file"
    assert args.path is not None, "--path required for generation!"

    print(args)

    # Benchmarking should be language-agnostic
    args.source_lang = "src"
    args.target_lang = "tgt"

    models, model_args, task = pytorch_translate_utils.load_diverse_ensemble_for_inference(
        args.path.split(":")
    )

    append_eos_to_source = model_args[0].append_eos_to_source
    reverse_source = model_args[0].reverse_source
    assert all(
        a.append_eos_to_source == append_eos_to_source
        and a.reverse_source == reverse_source
        for a in model_args
    )

    def benchmark_length(n):
        # Generate synthetic raw text files
        source_text_file = generate_synthetic_text(
            dialect=args.source_lang,
            dialect_symbols=task.source_dictionary.symbols,
            length=n,
            examples=args.examples_per_length,
        )
        target_text_file = generate_synthetic_text(
            dialect=args.target_lang,
            dialect_symbols=task.target_dictionary.symbols,
            length=n,
            examples=args.examples_per_length,
        )

        task.load_dataset_from_text(
            args.gen_subset,
            source_text_file=source_text_file,
            target_text_file=target_text_file,
            append_eos=append_eos_to_source,
            reverse_source=reverse_source,
        )

        # Remove temporary text files
        os.remove(source_text_file)
        os.remove(target_text_file)

        # priming
        scorer, num_sentences, gen_timer, _ = pytorch_translate_generate.generate_score(
            models=models, args=args, task=task, dataset=task.dataset(args.gen_subset)
        )

        total_time = 0.0
        for _ in range(args.runs_per_length):
            scorer, num_sentences, gen_timer, _ = pytorch_translate_generate.generate_score(
                models=models,
                args=args,
                task=task,
                dataset=task.dataset(args.gen_subset),
            )
            total_time += gen_timer.sum
            gen_timer.reset()

        sentences_per_run = args.examples_per_length
        runs = args.runs_per_length
        total_sentences = sentences_per_run * runs
        total_tokens = total_sentences * n

        print(f"--- {n} tokens ---")
        print(f"Generated {total_tokens} tokens ({runs} runs of {sentences_per_run})")
        print(f"Total time: {total_time:.3f} seconds")
        time_per_sentence = total_time / total_sentences
        print(f"Time per sentence: {time_per_sentence:.3f} seconds\n")

    benchmark_length(6)
    benchmark_length(10)
    benchmark_length(20)