Example #1
    def setup_task(cls, args, **kwargs):
        assert pytorch_translate_data.is_multilingual(
            args
        ), "Must set `--task pytorch_translate_multilingual` for multilingual training"
        args.left_pad_source = options.eval_bool(args.left_pad_source)

        def load_dicts(langs, paths):
            dicts = OrderedDict()
            for lang, dict_path in zip(langs, paths):
                d = pytorch_translate_dictionary.Dictionary.load(dict_path)
                dicts[lang] = d
                print(f"| [{lang}] dictionary: {len(d)} types")
            return dicts

        if not hasattr(args, "multiling_source_vocab_file"):
            args.multiling_encoder_lang = args.multiling_source_lang
            args.multiling_source_vocab_file = [args.source_vocab_file]
        if not hasattr(args, "multiling_target_vocab_file"):
            args.multiling_decoder_lang = args.multiling_target_lang
            args.multiling_target_vocab_file = [args.target_vocab_file]

        # Load dictionaries
        src_dicts = load_dicts(args.multiling_encoder_lang,
                               args.multiling_source_vocab_file)
        tgt_dicts = load_dicts(args.multiling_decoder_lang,
                               args.multiling_target_vocab_file)

        return cls(args, src_dicts, tgt_dicts)
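The load_dicts helper pairs each language with its vocabulary file, preserving order. Below is a minimal standalone sketch of the same pattern; the stand-in loader and its one-token-per-line file format are assumptions, not pytorch_translate_dictionary.Dictionary.load:

from collections import OrderedDict

def load_language_dicts(langs, paths, loader):
    # Map each language code to the dictionary loaded from its vocab file,
    # preserving the order in which the languages were given.
    dicts = OrderedDict()
    for lang, dict_path in zip(langs, paths):
        d = loader(dict_path)
        dicts[lang] = d
        print(f"| [{lang}] dictionary: {len(d)} types")
    return dicts

# Stand-in loader that reads one token per line (hypothetical file format).
def simple_loader(path):
    with open(path, encoding="utf-8") as f:
        return [line.split()[0] for line in f if line.strip()]

# dicts = load_language_dicts(["en", "fr"], ["vocab.en.txt", "vocab.fr.txt"], simple_loader)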
Example #2
    def setup_task(cls, args, **kwargs):
        args.left_pad_source = options.eval_bool(args.left_pad_source)

        assert not pytorch_translate_data.is_multilingual(
            args
        ), "Must set `--task pytorch_translate_multilingual` for multilingual training"

        # Load dictionaries
        source_dict = pytorch_translate_dictionary.Dictionary.load(
            args.source_vocab_file)
        target_dict = pytorch_translate_dictionary.Dictionary.load(
            args.target_vocab_file)

        source_lang = args.source_lang or "src"
        target_lang = args.target_lang or "tgt"

        print(f"| [{source_lang}] dictionary: {len(source_dict)} types")
        print(f"| [{target_lang}] dictionary: {len(target_dict)} types")

        use_char_source = (args.char_source_vocab_file != "") or (getattr(
            args, "arch", "") == "char_source")
        if use_char_source:
            char_source_dict = pytorch_translate_dictionary.Dictionary.load(
                args.char_source_vocab_file)
            # this attribute is used for CharSourceModel construction
            args.char_source_dict_size = len(char_source_dict)
        else:
            char_source_dict = None

        return cls(args, source_dict, target_dict, char_source_dict)
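The use_char_source toggle combines two signals: an explicit character vocabulary file, or the char_source architecture. A small self-contained sketch of that check, using argparse.Namespace objects as stand-ins for parsed arguments:

from argparse import Namespace

def wants_char_source(args):
    # Character-level source encoding is enabled either by an explicit char
    # vocab file or by selecting the char_source architecture.
    return (getattr(args, "char_source_vocab_file", "") != "") or (
        getattr(args, "arch", "") == "char_source"
    )

print(wants_char_source(Namespace(char_source_vocab_file="", arch="char_source")))  # True
print(wants_char_source(Namespace(char_source_vocab_file="", arch="rnn")))  # False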
Example #3
def generate(args):
    pytorch_translate_options.print_args(args)

    # Setup task
    task = tasks.setup_task(args)

    models, model_args = pytorch_translate_utils.load_diverse_ensemble_for_inference(
        args.path.split(":"), task)
    args.source_lang = model_args[0].source_lang
    args.target_lang = model_args[0].target_lang

    append_eos_to_source = model_args[0].append_eos_to_source
    reverse_source = model_args[0].reverse_source
    assert all(a.append_eos_to_source == append_eos_to_source
               and a.reverse_source == reverse_source for a in model_args)
    if args.source_binary_file != "":
        assert args.target_binary_file != ""
        task.load_dataset(args.gen_subset, args.source_binary_file,
                          args.target_binary_file)
    elif pytorch_translate_data.is_multilingual(args):
        task.set_encoder_langs(model_args[0].multiling_encoder_lang)
        task.set_decoder_langs(model_args[0].multiling_decoder_lang)
        task.load_dataset_from_text_multilingual(
            args.gen_subset,
            source_text_file=args.source_text_file[0],
            target_text_file=args.target_text_file,
            source_lang_id=task.get_encoder_lang_id(
                args.multiling_source_lang[0]),
            target_lang_id=task.get_decoder_lang_id(
                args.multiling_target_lang[0]),
            append_eos=append_eos_to_source,
            reverse_source=reverse_source,
        )
    elif args.source_ensembling:
        task.load_multisource_dataset_from_text(
            args.gen_subset,
            source_text_files=args.source_text_file,
            target_text_file=args.target_text_file,
            append_eos=append_eos_to_source,
            reverse_source=reverse_source,
        )
    else:
        task.load_dataset_from_text(
            args.gen_subset,
            source_text_file=args.source_text_file[0],
            target_text_file=args.target_text_file,
            append_eos=append_eos_to_source,
            reverse_source=reverse_source,
        )

    scorer, num_sentences, gen_timer, _ = _generate_score(
        models=models, args=args, task=task, dataset=task.dataset(args.gen_subset)
    )
    print(f"| Translated {num_sentences} sentences ({gen_timer.n} tokens) "
          f"in {gen_timer.sum:.1f}s ({1. / gen_timer.avg:.2f} tokens/s)")
    print(f"| Generate {args.gen_subset} with beam={args.beam}: "
          f"{scorer.result_string()}")
    return scorer.score()
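Before loading data, generate() asserts that every checkpoint in the ensemble was trained with the same source-side preprocessing. A minimal sketch of that consistency check in isolation; the Namespace objects stand in for loaded checkpoint args:

from argparse import Namespace

def check_ensemble_consistency(model_args):
    # Every checkpoint in the ensemble must agree on source-side
    # preprocessing, or the models would see incompatible inputs.
    ref = model_args[0]
    assert all(
        a.append_eos_to_source == ref.append_eos_to_source
        and a.reverse_source == ref.reverse_source
        for a in model_args
    ), "ensemble members disagree on source preprocessing"

check_ensemble_consistency([
    Namespace(append_eos_to_source=True, reverse_source=False),
    Namespace(append_eos_to_source=True, reverse_source=False),
])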
Example #4
    def build_model(cls, args, src_dict, dst_dict):
        """Build a new model instance."""
        base_architecture(args)
        if pytorch_translate_data.is_multilingual(args):
            return RNNModel.build_model_multilingual(args, src_dict, dst_dict)
        encoder = RNNModel.build_encoder(args, src_dict)
        decoder = RNNModel.build_decoder(args, src_dict, dst_dict)
        return cls(encoder, decoder)
Example #5
    def build_model(cls, args, task):
        """Build a new model instance."""
        base_architecture(args)
        # Set a default value for old checkpoints saved before this flag existed.
        args.left_pad_source = getattr(args, "left_pad_source", True)
        if pytorch_translate_data.is_multilingual(args):
            return RNNModel.build_model_multilingual(args, task)
        src_dict, dst_dict = task.source_dictionary, task.target_dictionary
        encoder = RNNModel.build_encoder(args, src_dict)
        decoder = RNNModel.build_decoder(args, src_dict, dst_dict)
        return cls(task, encoder, decoder)
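Compared with Example #4, this variant takes the task object instead of raw dictionaries and back-fills left_pad_source for checkpoints saved before the flag existed. A tiny sketch of that back-fill pattern; the field names here are illustrative:

from argparse import Namespace

# Checkpoint args saved before the left_pad_source flag existed (hypothetical).
old_args = Namespace(encoder_hidden_dim=512)
# getattr with a default lets new code load old checkpoints without crashing.
old_args.left_pad_source = getattr(old_args, "left_pad_source", True)
print(old_args.left_pad_source)  # True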
Example #6
def preprocess_corpora(args):
    args.train_source_binary_path = maybe_generate_temp_file_path(
        args.train_source_binary_path)
    args.train_target_binary_path = maybe_generate_temp_file_path(
        args.train_target_binary_path)
    args.eval_source_binary_path = maybe_generate_temp_file_path(
        args.eval_source_binary_path)
    args.eval_target_binary_path = maybe_generate_temp_file_path(
        args.eval_target_binary_path)

    # Additional text preprocessing options could be added here before
    # binarizing.
    if pytorch_translate_data.is_multilingual(args):
        preprocess_corpora_multilingual(args)
    else:
        preprocess_corpora_bilingual(args)
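maybe_generate_temp_file_path is not shown in these examples. Judging by its name and call sites, a plausible sketch (an assumption, not the library's actual implementation) keeps a caller-supplied path and otherwise allocates a temporary file:

import os
import tempfile

def maybe_generate_temp_file_path(output_path):
    # Hypothetical sketch: keep a caller-supplied path; otherwise allocate a
    # temporary file so the binarization step always has somewhere to write.
    if output_path:
        return output_path
    fd, path = tempfile.mkstemp()
    os.close(fd)
    return path

print(maybe_generate_temp_file_path("train.src.bin"))  # returned unchanged
print(maybe_generate_temp_file_path(None))  # fresh temp path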
Example #7
def preprocess_corpora(args):
    args.train_source_binary_path = maybe_generate_temp_file_path(
        args.train_source_binary_path
    )
    args.train_target_binary_path = maybe_generate_temp_file_path(
        args.train_target_binary_path
    )
    args.eval_source_binary_path = maybe_generate_temp_file_path(
        args.eval_source_binary_path
    )
    args.eval_target_binary_path = maybe_generate_temp_file_path(
        args.eval_target_binary_path
    )

    # Additional text preprocessing options could be added here before
    # binarizing.
    if pytorch_translate_data.is_multilingual(args):
        preprocess_corpora_multilingual(args)
    else:
        # Vocabs are built before preprocessing because we might need to use
        # both monolingual and bilingual corpora sources to build the vocab
        # (in the case of semisupervised training)
        source_dict, char_source_dict, target_dict = build_vocabs(args=args)

        preprocess_bilingual_corpora(
            args=args,
            source_dict=source_dict,
            char_source_dict=char_source_dict,
            target_dict=target_dict,
        )
        # Binarize additional monolingual corpora for the semisupervised translation
        # task
        if args.task == constants.SEMI_SUPERVISED_TASK:
            args.train_mono_source_binary_path = maybe_generate_temp_file_path(
                output_path=getattr(args, "train_mono_source_binary_path", None)
            )
            args.train_mono_target_binary_path = maybe_generate_temp_file_path(
                output_path=getattr(args, "train_mono_target_binary_path", None)
            )
            preprocess_monolingual_corpora(
                args,
                source_dict=source_dict,
                char_source_dict=char_source_dict,
                target_dict=target_dict,
            )
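The final branch only runs for the semisupervised task. A small sketch of that gate; the constant's value below is an assumption, since the real one lives in pytorch_translate's constants module:

from argparse import Namespace

# Assumed value; the real constant is constants.SEMI_SUPERVISED_TASK.
SEMI_SUPERVISED_TASK = "pytorch_translate_semi_supervised"

def needs_mono_binarization(args):
    # Monolingual corpora are binarized only for the semisupervised task.
    return getattr(args, "task", None) == SEMI_SUPERVISED_TASK

print(needs_mono_binarization(Namespace(task=SEMI_SUPERVISED_TASK)))  # True
print(needs_mono_binarization(Namespace(task="pytorch_translate")))  # False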
Example #8
def _generate_score(models, args, task, dataset, optimize=True):
    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load ensemble
    if not args.quiet:
        print("| loading model(s) from {}".format(", ".join(args.path.split(":"))))

    # Optimize ensemble for generation
    if optimize:
        for model in models:
            model.make_generation_fast_(
                beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
                need_attn=True,
            )

    translator = build_sequence_generator(args, task, models)
    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Keep track of translations: initialize with empty translations
    # and zero probability scores.
    translated_sentences = [""] * len(dataset)
    translated_scores = [0.0] * len(dataset)

    # Generate and compute BLEU score
    dst_dict = task.target_dictionary
    scorer = bleu.Scorer(dst_dict.pad(), dst_dict.eos(), dst_dict.unk())
    itr = get_eval_itr(args, models, task, dataset)

    num_sentences = 0
    translation_samples = []
    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        gen_timer = StopwatchMeter()
        translations = translator.generate_batched_itr(
            t,
            maxlen_a=args.max_len_a,
            maxlen_b=args.max_len_b,
            cuda=use_cuda,
            timer=gen_timer,
            prefix_size=1 if pytorch_translate_data.is_multilingual(args) else 0,
        )
        if pytorch_translate_data.is_multilingual(args):
            first_best_translations = _iter_first_best_multilingual
        else:
            first_best_translations = _iter_first_best_bilingual
        for trans_info in first_best_translations(
            args, task, dataset, translations, align_dict
        ):
            scorer.add(trans_info.target_tokens, trans_info.hypo_tokens)
            translated_sentences[trans_info.sample_id] = trans_info.hypo_str
            translated_scores[trans_info.sample_id] = trans_info.hypo_score
            translation_samples.append(
                collections.OrderedDict(
                    {
                        "sample_id": trans_info.sample_id.item(),
                        "src_str": trans_info.src_str,
                        "target_str": trans_info.target_str,
                        "hypo_str": trans_info.hypo_str,
                    }
                )
            )
            wps_meter.update(trans_info.src_tokens.size(0))
            t.log({"wps": round(wps_meter.avg)})
            num_sentences += 1

    # If applicable, save the translations to the output file,
    # e.g., for external evaluation.
    if getattr(args, "translation_output_file", False):
        with open(args.translation_output_file, "w") as out_file:
            for hypo_str in translated_sentences:
                print(hypo_str, file=out_file)

    if getattr(args, "translation_probs_file", False):
        with open(args.translation_probs_file, "w") as out_file:
            for hypo_score in translated_scores:
                print(np.exp(hypo_score), file=out_file)

    return scorer, num_sentences, gen_timer, translation_samples
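The two trailing blocks persist hypotheses and their probabilities (log scores exponentiated with np.exp) for external evaluation. A self-contained sketch of that output convention with made-up data; the file names and scores are illustrative:

import numpy as np

translated_sentences = ["hallo welt", "guten morgen"]
translated_scores = [-0.223, -0.105]  # per-hypothesis log-probabilities

# Same convention as above: one hypothesis per line, plus the probability
# obtained by exponentiating each log score.
with open("hypos.txt", "w") as out_file:
    for hypo_str in translated_sentences:
        print(hypo_str, file=out_file)
with open("probs.txt", "w") as out_file:
    for hypo_score in translated_scores:
        print(np.exp(hypo_score), file=out_file)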
Example #9
def generate(args):
    pytorch_translate_options.print_args(args)

    src_dict = pytorch_translate_dictionary.Dictionary.load(args.source_vocab_file)
    dst_dict = pytorch_translate_dictionary.Dictionary.load(args.target_vocab_file)
    use_char_source = args.char_source_vocab_file != ""
    if use_char_source:
        char_source_dict = pytorch_translate_dictionary.Dictionary.load(
            args.char_source_vocab_file
        )
        # this attribute is used for CharSourceModel construction
        args.char_source_dict_size = len(char_source_dict)
    else:
        char_source_dict = None

    dataset = data.LanguageDatasets(
        src=args.source_lang, dst=args.target_lang, src_dict=src_dict, dst_dict=dst_dict
    )
    models, model_args = pytorch_translate_utils.load_diverse_ensemble_for_inference(
        args.path, dataset.src_dict, dataset.dst_dict
    )
    append_eos_to_source = model_args[0].append_eos_to_source
    reverse_source = model_args[0].reverse_source
    assert all(
        a.append_eos_to_source == append_eos_to_source
        and a.reverse_source == reverse_source
        for a in model_args
    )
    if args.source_binary_file != "":
        assert args.target_binary_file != ""
        dst_dataset = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
            args.target_binary_file
        )
        if use_char_source:
            src_dataset = char_data.InMemoryNumpyWordCharDataset.create_from_file(
                args.source_binary_file
            )
            gen_split = char_data.LanguagePairSourceCharDataset(
                src=src_dataset,
                dst=dst_dataset,
                pad_idx=src_dict.pad(),
                eos_idx=dst_dict.eos(),
            )
        else:
            src_dataset = pytorch_translate_data.InMemoryNumpyDataset.create_from_file(
                args.source_binary_file
            )
            gen_split = data.LanguagePairDataset(
                src=src_dataset,
                dst=dst_dataset,
                pad_idx=src_dict.pad(),
                eos_idx=dst_dict.eos(),
            )
    elif pytorch_translate_data.is_multilingual(args):
        gen_split = pytorch_translate_data.make_language_pair_dataset_from_text_multilingual(
            source_text_file=args.source_text_file[0],
            target_text_file=args.target_text_file,
            source_lang_id=args.multiling_source_lang_id,
            target_lang_id=args.multiling_target_lang_id,
            source_dict=src_dict,
            target_dict=dst_dict,
            append_eos=append_eos_to_source,
            reverse_source=reverse_source,
        )
    elif args.source_ensembling:
        gen_split = multisource_data.make_multisource_language_pair_dataset_from_text(
            source_text_files=args.source_text_file,
            target_text_file=args.target_text_file,
            source_dict=src_dict,
            target_dict=dst_dict,
            append_eos=append_eos_to_source,
            reverse_source=reverse_source,
        )
    else:
        gen_split = pytorch_translate_data.make_language_pair_dataset_from_text(
            source_text_file=args.source_text_file[0],
            target_text_file=args.target_text_file,
            source_dict=src_dict,
            target_dict=dst_dict,
            append_eos=append_eos_to_source,
            reverse_source=reverse_source,
            char_source_dict=char_source_dict,
        )
    dataset.splits[args.gen_subset] = gen_split

    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print(f"| [{dataset.src}] dictionary: {len(dataset.src_dict)} types")
    print(f"| [{dataset.dst}] dictionary: {len(dataset.dst_dict)} types")
    print(f"| {args.gen_subset} {len(dataset.splits[args.gen_subset])} examples")
    scorer, num_sentences, gen_timer, _ = _generate_score(
        models=models, args=args, dataset=dataset, dataset_split=args.gen_subset
    )
    print(
        f"| Translated {num_sentences} sentences ({gen_timer.n} tokens) "
        f"in {gen_timer.sum:.1f}s ({1. / gen_timer.avg:.2f} tokens/s)"
    )
    print(
        f"| Generate {args.gen_subset} with beam={args.beam}: "
        f"{scorer.result_string()}"
    )
    return scorer.score()
Example #10
def _generate_score(models, args, dataset, dataset_split, optimize=True):
    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load ensemble
    if not args.quiet:
        print("| loading model(s) from {}".format(", ".join(args.path)))

    # Optimize ensemble for generation
    if optimize:
        for model in models:
            model.make_generation_fast_(
                beamable_mm_beam_size=None if args.no_beamable_mm else args.beam
            )

    # Initialize generator
    model_weights = None
    if args.model_weights:
        model_weights = [float(w.strip()) for w in args.model_weights.split(",")]
    use_char_source = isinstance(models[0], char_source_model.CharSourceModel)
    # Use a different sequence generator in the multisource setting
    if getattr(args, "source_ensembling", False):
        translator_class = multisource_decode.MultiSourceSequenceGenerator
    else:
        translator_class = beam_decode.SequenceGenerator
    translator = translator_class(
        models,
        beam_size=args.beam,
        stop_early=(not args.no_early_stop),
        normalize_scores=(not args.unnormalized),
        len_penalty=args.length_penalty,
        unk_reward=args.unk_reward,
        word_reward=args.word_reward,
        model_weights=model_weights,
        use_char_source=use_char_source,
    )
    if use_cuda:
        translator.cuda()
    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Keep track of translations: initialize with empty translations
    # and zero probability scores.
    translated_sentences = [""] * len(dataset.splits[dataset_split])
    translated_scores = [0.0] * len(dataset.splits[dataset_split])

    # Generate and compute BLEU score
    scorer = bleu.Scorer(
        dataset.dst_dict.pad(), dataset.dst_dict.eos(), dataset.dst_dict.unk()
    )
    max_positions = min(model.max_encoder_positions() for model in models)
    itr = dataset.eval_dataloader(
        dataset_split,
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=max_positions,
        skip_invalid_size_inputs_valid_test=(args.skip_invalid_size_inputs_valid_test),
    )
    if args.num_shards > 1:
        if args.shard_id < 0 or args.shard_id >= args.num_shards:
            raise ValueError("--shard-id must be between 0 and num_shards - 1")
        itr = data.sharded_iterator(itr, args.num_shards, args.shard_id)

    num_sentences = 0
    translation_samples = []
    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        # Keep more detailed timing when invoked from benchmark
        if "keep_detailed_timing" in args:
            gen_timer = pytorch_translate_utils.BucketStopwatchMeter(
                args.increment, args.max_length, args.samples_per_length
            )
        else:
            gen_timer = StopwatchMeter()
        translations = translator.generate_batched_itr(
            t,
            maxlen_a=args.max_len_a,
            maxlen_b=args.max_len_b,
            cuda=use_cuda,
            timer=gen_timer,
            prefix_size=1 if pytorch_translate_data.is_multilingual(args) else 0,
        )
        if pytorch_translate_data.is_multilingual(args):
            first_best_translations = _iter_first_best_multilingual
        else:
            first_best_translations = _iter_first_best_bilingual
        for trans_info in first_best_translations(
            args, dataset, dataset_split, translations, align_dict
        ):
            scorer.add(trans_info.target_tokens, trans_info.hypo_tokens)
            translated_sentences[trans_info.sample_id] = trans_info.hypo_str
            translated_scores[trans_info.sample_id] = trans_info.hypo_score
            translation_samples.append(
                collections.OrderedDict(
                    {
                        "sample_id": trans_info.sample_id,
                        "src_str": trans_info.src_str,
                        "target_str": trans_info.target_str,
                        "hypo_str": trans_info.hypo_str,
                    }
                )
            )
            wps_meter.update(trans_info.src_tokens.size(0))
            t.log({"wps": round(wps_meter.avg)})
            num_sentences += 1

    # If applicable, save the translations to the output file,
    # e.g., for external evaluation.
    if getattr(args, "translation_output_file", False):
        with open(args.translation_output_file, "w") as out_file:
            for hypo_str in translated_sentences:
                print(hypo_str, file=out_file)

    if getattr(args, "translation_probs_file", False):
        with open(args.translation_probs_file, "w") as out_file:
            for hypo_score in translated_scores:
                print(np.exp(hypo_score), file=out_file)

    return scorer, num_sentences, gen_timer, translation_samples
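model_weights arrives as a comma-separated string and is parsed into one float per ensemble member. A short sketch of that parsing, with an added sanity check that is not part of the code above:

def parse_model_weights(spec, num_models):
    # "0.5,0.3,0.2"-style string -> list of floats, one weight per model.
    weights = [float(w.strip()) for w in spec.split(",")]
    assert len(weights) == num_models, "need exactly one weight per model"
    return weights

print(parse_model_weights("0.5, 0.3, 0.2", 3))  # [0.5, 0.3, 0.2]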
Example #11
def _iter_translations(args,
                       task,
                       dataset,
                       translations,
                       align_dict,
                       rescoring_model=None):
    """Iterate over translations.

    This is a generator function which wraps the beam-search sequence generator,
    performing such work on the output as converting token indices to
    strings, printing output where applicable (not args.quiet), collecting
    oracle translations where applicable, and removing language-ID tokens
    for multilingual translation.

    Args:
        args: Command-line arguments.
        task: FairseqTask object.
        dataset: Dataset object for a specific split.
        translations: Batched translation iterator, as returned by
            SequenceGenerator.generate_batched_itr().
        align_dict: Dictionary for UNK replacement.
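        rescoring_model: Optional model used to rescore n-best hypotheses.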

    Yields:
        For each sentence in `translations`, yields a TranslationInfo.
    """
    is_multilingual = pytorch_translate_data.is_multilingual(args)

    for sample_id, src_tokens, target_tokens, hypos in translations:
        # Process input and ground truth
        target_tokens = target_tokens.int().cpu()

        if is_multilingual:
            src_lang_id = (src_tokens[-1] -
                           pytorch_translate_data.MULTILING_DIALECT_ID_OFFSET)
            target_lang_id = (
                target_tokens[0] -
                pytorch_translate_data.MULTILING_DIALECT_ID_OFFSET)

            # remove language ID tokens
            src_tokens = src_tokens[:-1]
            target_tokens = target_tokens[1:]

            # Select dictionaries
            src_dict = task.source_dictionaries[task.get_encoder_lang_code(
                src_lang_id)]
            target_dict = task.target_dictionaries[task.get_decoder_lang_code(
                target_lang_id)]
        else:
            src_dict = task.source_dictionary
            target_dict = task.target_dictionary

        # Either retrieve the original sentences or regenerate them from tokens.
        if align_dict is not None:
            src_str = dataset.src.get_original_text(sample_id)
            target_str = dataset.tgt.get_original_text(sample_id)
        else:
            src_str = src_dict.string(src_tokens, args.remove_bpe)
            target_str = target_dict.string(target_tokens,
                                            args.remove_bpe,
                                            escape_unk=True)

        if not args.quiet:
            print(f"S-{sample_id}\t{src_str}")
            print(f"T-{sample_id}\t{target_str}")

        # used for oracle evaluation (args.report_oracle_bleu)
        best_hypo_tokens = None
        best_hypo_score = 0
        collect_oracle_hypos = args.report_oracle_bleu or (
            args.output_hypos_binary_path and args.nbest > 0)

        # Process top predictions
        for i, hypo in enumerate(hypos[:min(len(hypos), args.nbest)]):
            hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                hypo_tokens=hypo["tokens"].int().cpu(),
                src_str=src_str,
                alignment=hypo["alignment"].int().cpu(),
                align_dict=align_dict,
                tgt_dict=task.target_dictionary,
                remove_bpe=args.remove_bpe,
            )

            if not args.quiet:
                print(f"H-{sample_id}\t{hypo['score']}\t{hypo_str}")
                print("A-{}\t{}".format(
                    sample_id,
                    " ".join(map(lambda x: str(utils.item(x)), alignment)),
                ))

            if collect_oracle_hypos:
                score = smoothed_sentence_bleu(task, target_tokens,
                                               hypo_tokens)
                if score > best_hypo_score:
                    best_hypo_tokens = hypo_tokens
                    best_hypo_score = score

            if i == 0:
                if align_dict is not None or args.remove_bpe is not None:
                    # Convert back to tokens for evaluation with unk replacement
                    # and/or without BPE
                    target_tokens = tokenizer.Tokenizer.tokenize(
                        target_str,
                        task.target_dictionary,
                        add_if_not_exist=True)
                # The probs score for the hypo_str; whether it is normalized
                # by sequence length depends on normalize_scores, which is
                # controlled by args.unnormalized.
                # However, in testing the returned scores were the same
                # whether or not normalize_scores was set (to be investigated).
                # Here, the probs are normalized by hypo length so the value
                # is large enough to be used as a weight for backtranslations
                # in dual learning.
                hypo_score = (hypo["score"] / len(hypo_tokens)
                              if len(hypo_tokens) > 0 else 0.0)
                top_hypo_tokens = hypo_tokens

        if not collect_oracle_hypos:
            best_hypo_tokens = top_hypo_tokens

        hypo_tokens_after_rescoring = rescoring.run_rescoring(
            args, task, hypos[:args.nbest], src_tokens, rescoring_model)

        yield TranslationInfo(
            sample_id=sample_id,
            src_tokens=src_tokens,
            target_tokens=target_tokens,
            hypo_tokens=top_hypo_tokens,
            src_str=src_str,
            target_str=target_str,
            hypo_str=hypo_str,
            hypo_score=hypo_score,
            best_hypo_tokens=best_hypo_tokens,
            hypo_tokens_after_rescoring=hypo_tokens_after_rescoring,
        )
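Callers such as _generate_score consume this generator one TranslationInfo at a time, indexing results by sample_id. A minimal sketch of that consumption loop, using a namedtuple stand-in for the real record type:

import collections

# Stand-in record with a subset of TranslationInfo's fields (hypothetical).
Info = collections.namedtuple("Info", ["sample_id", "hypo_str", "hypo_score"])

def collect_first_best(translations):
    # Index first-best strings and scores by sample id, as _generate_score does.
    sentences, scores = {}, {}
    for info in translations:
        sentences[info.sample_id] = info.hypo_str
        scores[info.sample_id] = info.hypo_score
    return sentences, scores

print(collect_first_best([Info(0, "hello world", -0.2), Info(1, "good morning", -0.1)]))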