Code Example #1
def _generate_score(models, args, dataset, dataset_split):
    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load ensemble
    if not args.quiet:
        print(f"| loading model(s) from {', '.join(args.path)}")

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam
        )

    # Initialize generator
    translator = beam_decode.SequenceGenerator(
        models,
        beam_size=args.beam,
        stop_early=(not args.no_early_stop),
        normalize_scores=(not args.unnormalized),
        len_penalty=args.lenpen,
        unk_penalty=args.unkpen,
        word_reward=args.word_reward,
    )
    if use_cuda:
        translator.cuda()
    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Generate and compute BLEU score
    scorer = bleu.Scorer(
        dataset.dst_dict.pad(),
        dataset.dst_dict.eos(),
        dataset.dst_dict.unk(),
    )
    max_positions = min(model.max_encoder_positions() for model in models)
    itr = dataset.eval_dataloader(
        dataset_split,
        max_sentences=args.max_sentences,
        max_positions=max_positions,
        skip_invalid_size_inputs_valid_test=(
            args.skip_invalid_size_inputs_valid_test
        ),
    )
    if args.num_shards > 1:
        if args.shard_id < 0 or args.shard_id >= args.num_shards:
            raise ValueError('--shard-id must be between 0 and num_shards')
        itr = data.sharded_iterator(itr, args.num_shards, args.shard_id)

    num_sentences = 0
    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        gen_timer = StopwatchMeter()
        translations = translator.generate_batched_itr(
            t, maxlen_a=args.max_len_a, maxlen_b=args.max_len_b,
            cuda=use_cuda, timer=gen_timer)
        for sample_id, src_tokens, target_tokens, hypos in translations:
            # Process input and ground truth
            target_tokens = target_tokens.int().cpu()
            # Either retrieve the original sentences or regenerate them from tokens.
            if align_dict is not None:
                src_str = dataset.splits[dataset_split].src.\
                    get_original_text(sample_id)
                target_str = dataset.splits[dataset_split].dst.\
                    get_original_text(sample_id)
            else:
                src_str = dataset.src_dict.string(src_tokens, args.remove_bpe)
                target_str = dataset.dst_dict.string(
                    target_tokens,
                    args.remove_bpe,
                    escape_unk=True,
                )

            if not args.quiet:
                print(f'S-{sample_id}\t{src_str}')
                print(f'T-{sample_id}\t{target_str}')

            # Process top predictions
            for i, hypo in enumerate(hypos[:min(len(hypos), args.nbest)]):
                hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                    hypo_tokens=hypo['tokens'].int().cpu(),
                    src_str=src_str,
                    alignment=hypo['alignment'].int().cpu(),
                    align_dict=align_dict,
                    dst_dict=dataset.dst_dict,
                    remove_bpe=args.remove_bpe)

                if not args.quiet:
                    print(f"H-{sample_id}\t{hypo['score']}\t{hypo_str}")
                    print(f"A-{sample_id}\t{' '.join(map(lambda x: str(utils.item(x)), alignment))}")


                # Score only the top hypothesis
                if i == 0:
                    if align_dict is not None or args.remove_bpe is not None:
                        # Convert back to tokens for evaluation with unk replacement
                        # and/or without BPE
                        target_tokens = tokenizer.Tokenizer.tokenize(
                            target_str,
                            dataset.dst_dict,
                            add_if_not_exist=True,
                        )
                    scorer.add(target_tokens, hypo_tokens)

            wps_meter.update(src_tokens.size(0))
            t.log({'wps': round(wps_meter.avg)})
            num_sentences += 1

    return scorer, num_sentences, gen_timer
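
As a point of reference, the corpus-level scorer used above can be exercised on its own. A minimal sketch, assuming fairseq's legacy bleu.Scorer API and the default Dictionary special-token indices (pad=1, eos=2, unk=3); the other token IDs are placeholders:

import torch
from fairseq import bleu

scorer = bleu.Scorer(1, 2, 3)  # pad, eos, unk indices (assumed defaults)

# add() takes integer token tensors: reference first, then hypothesis.
ref = torch.IntTensor([4, 5, 6, 2])
hyp = torch.IntTensor([4, 5, 7, 2])
scorer.add(ref, hyp)

print(scorer.result_string())  # corpus BLEU over everything added so far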
Code Example #2
File: generate.py Project: zbn123/translate
def _generate_score(models, args, task, dataset):
    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load ensemble
    if not args.quiet:
        print("| loading model(s) from {}".format(", ".join(
            args.path.split(":"))))

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=True,
        )

    translator = build_sequence_generator(args, task, models)
    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    print("seed number is" + str(args.max_examples_to_evaluate_seed))
    if args.max_examples_to_evaluate > 0:
        pytorch_translate_data.subsample_pair_dataset(
            dataset, args.max_examples_to_evaluate,
            args.max_examples_to_evaluate_seed)

    # Keep track of translations
    # Initialize with empty translations
    # and zero probs scores
    translated_sentences = [""] * len(dataset)
    translated_scores = [0.0] * len(dataset)
    hypos_list = []

    collect_output_hypos = getattr(args, "output_hypos_binary_path", False)
    if collect_output_hypos:
        output_hypos_token_arrays = [None] * len(dataset)

    # Generate and compute BLEU score
    dst_dict = task.target_dictionary
    if args.sacrebleu:
        scorer = bleu.SacrebleuScorer()
    else:
        scorer = bleu.Scorer(dst_dict.pad(), dst_dict.eos(), dst_dict.unk())

    itr = task.get_batch_iterator(
        dataset=dataset,
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[model.max_positions() for model in models]),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=8,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)

    oracle_scorer = None
    if args.report_oracle_bleu:
        oracle_scorer = bleu.Scorer(dst_dict.pad(), dst_dict.eos(),
                                    dst_dict.unk())

    rescorer = None
    num_sentences = 0
    translation_samples = []
    translation_info_list = []
    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        gen_timer = StopwatchMeter()
        translations = translator.generate_batched_itr(
            t,
            maxlen_a=args.max_len_a,
            maxlen_b=args.max_len_b,
            cuda=use_cuda,
            timer=gen_timer,
            prefix_size=1
            if pytorch_translate_data.is_multilingual_many_to_one(args) else 0,
        )

        for trans_info in _iter_translations(args, task, dataset, translations,
                                             align_dict, rescorer):
            if hasattr(scorer, "add_string"):
                scorer.add_string(trans_info.target_str, trans_info.hypo_str)
            else:
                scorer.add(trans_info.target_tokens, trans_info.hypo_tokens)
            if oracle_scorer is not None:
                oracle_scorer.add(trans_info.target_tokens,
                                  trans_info.best_hypo_tokens)

            if getattr(args, "translation_output_file", False):
                translated_sentences[
                    trans_info.sample_id] = trans_info.hypo_str
            if getattr(args, "translation_probs_file", False):
                translated_scores[trans_info.sample_id] = trans_info.hypo_score
            if getattr(args, "hypotheses_export_path", False):
                hypos_list.append(trans_info.hypos)
            if collect_output_hypos:
                output_hypos_token_arrays[
                    trans_info.sample_id] = trans_info.best_hypo_tokens
            if args.translation_info_export_path is not None:
                # Strip expensive data from hypotheses before saving and make
                # sure everything is on cpu before exporting
                hypos = [{
                    "score": hypo["score"],
                    "tokens": hypo["tokens"].cpu(),
                } for hypo in trans_info.hypos]
                translation_info_list.append({
                    "src_tokens": trans_info.src_tokens.cpu(),
                    "target_tokens": trans_info.target_tokens,
                    "hypos": hypos,
                })
            translation_samples.append(
                collections.OrderedDict({
                    "sample_id": trans_info.sample_id.item(),
                    "src_str": trans_info.src_str,
                    "target_str": trans_info.target_str,
                    "hypo_str": trans_info.hypo_str,
                }))
            wps_meter.update(trans_info.src_tokens.size(0))
            t.log({"wps": round(wps_meter.avg)})
            num_sentences += 1

    # If applicable, save collected hypothesis tokens to binary output file
    if collect_output_hypos:
        output_dataset = pytorch_translate_data.InMemoryIndexedDataset()
        output_dataset.load_from_sequences(output_hypos_token_arrays)
        output_dataset.save(args.output_hypos_binary_path)
    if args.output_source_binary_path:
        dataset.src.save(args.output_source_binary_path)
    if args.translation_info_export_path is not None:
        with open(args.translation_info_export_path, "wb") as f:
            pickle.dump(translation_info_list, f)

    # If applicable, save the translations and scores to the output files
    # These two outputs are used in dual learning for weighted backtranslation
    if getattr(args, "translation_output_file", False):
        with open(args.translation_output_file, "w") as out_file:
            for hypo_str in translated_sentences:
                print(hypo_str, file=out_file)

    if getattr(args, "translation_probs_file", False):
        with open(args.translation_probs_file, "w") as out_file:
            for hypo_score in translated_scores:
                print(np.exp(hypo_score), file=out_file)

    # e.g., for external evaluation
    if getattr(args, "hypotheses_export_path", False):
        with open(args.hypotheses_export_path, "w") as out_file:
            for hypos in hypos_list:
                for hypo in hypos:
                    print(
                        task.tgt_dict.string(hypo["tokens"],
                                             bpe_symbol=args.remove_bpe),
                        file=out_file,
                    )

    if oracle_scorer is not None:
        print(
            f"| Oracle BLEU (best hypo in beam): {oracle_scorer.result_string()}"
        )

    return scorer, num_sentences, gen_timer, translation_samples
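
The add_string/add branching above is what lets one loop drive either scorer. A minimal sketch of the sacrebleu path, assuming fairseq's bleu.SacrebleuScorer wrapper (which requires the sacrebleu package) and made-up sentences:

from fairseq import bleu

scorer = bleu.SacrebleuScorer()
# add_string() takes detokenized text: reference first, then hypothesis.
scorer.add_string("the cat sat on the mat", "the cat sat on a mat")
print(scorer.result_string())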
Code Example #3
File: generate.py Project: StuartCHAN/KARL
def main(args):
    assert args.path is not None, '--path required for generation!'
    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert args.replace_unk is None or args.raw_text, \
        '--replace-unk requires a raw text dataset (--raw-text)'

    utils.import_user_module(args)

    if args.max_tokens is None and args.max_sentences is None:
        args.max_tokens = 12000
    print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args.gen_subset)

    # Set dictionaries
    try:
        src_dict = getattr(task, 'source_dictionary', None)
    except NotImplementedError:
        src_dict = None
    tgt_dict = task.target_dictionary

    # Load ensemble
    print('| loading model(s) from {}'.format(args.path))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        args.path.split(':'),
        arg_overrides=eval(args.model_overrides),
        task=task,
        bert_ratio=args.bert_ratio if args.change_ratio else None,
        encoder_ratio=args.encoder_ratio if args.change_ratio else None,
        geargs=args,
    )

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[model.max_positions() for model in models]),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=args.required_batch_size_multiple,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)

    # Initialize generator
    gen_timer = StopwatchMeter()
    generator = task.build_generator(args)

    # Generate and compute BLEU score
    if args.sacrebleu:
        scorer = bleu.SacrebleuScorer()
    else:
        scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk())
    num_sentences = 0
    has_target = True
    resdict = {}
    results_path = args.results_path
    stamp = str(time.time())
    resfp = results_path + "/" + args.gen_subset + "." + stamp + ".gen_sparql.json"

    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        for sample in t:
            sample = utils.move_to_cuda(sample) if use_cuda else sample
            if 'net_input' not in sample:
                continue

            prefix_tokens = None
            if args.prefix_size > 0:
                prefix_tokens = sample['target'][:, :args.prefix_size]

            gen_timer.start()
            hypos = task.inference_step(generator, models, sample,
                                        prefix_tokens)
            num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos)
            gen_timer.stop(num_generated_tokens)

            for i, sample_id in enumerate(sample['id'].tolist()):
                has_target = sample['target'] is not None

                # Remove padding
                src_tokens = utils.strip_pad(
                    sample['net_input']['src_tokens'][i, :], tgt_dict.pad())
                target_tokens = None
                if has_target:
                    target_tokens = utils.strip_pad(
                        sample['target'][i, :], tgt_dict.pad()).int().cpu()

                # Either retrieve the original sentences or regenerate them from tokens.
                if align_dict is not None:
                    src_str = task.dataset(
                        args.gen_subset).src.get_original_text(sample_id)
                    target_str = task.dataset(
                        args.gen_subset).tgt.get_original_text(sample_id)
                else:
                    if src_dict is not None:
                        src_str = src_dict.string(src_tokens, args.remove_bpe)
                    else:
                        src_str = ""
                    if has_target:
                        target_str = tgt_dict.string(target_tokens,
                                                     args.remove_bpe,
                                                     escape_unk=True)
                if not args.quiet:
                    if src_dict is not None:
                        print('S-{}\t{}'.format(sample_id, src_str))
                    if has_target:
                        print('T-{}\t{}'.format(sample_id, target_str))

                # Process top predictions
                for j, hypo in enumerate(
                        hypos[i][:min(len(hypos[i]), args.nbest)]):
                    hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                        hypo_tokens=hypo['tokens'].int().cpu(),
                        src_str=src_str,
                        alignment=hypo['alignment'].int().cpu()
                        if hypo['alignment'] is not None else None,
                        align_dict=align_dict,
                        tgt_dict=tgt_dict,
                        remove_bpe=args.remove_bpe,
                    )
                    resdict[str(int(sample_id) + 1)] = {
                        "sparql": interprete(hypo_str),
                        "en": src_str
                    }  ##!!!
                    if not args.quiet:
                        print('H-{}\t{}\t{}'.format(sample_id, hypo['score'],
                                                    hypo_str))
                        print('P-{}\t{}'.format(
                            sample_id, ' '.join(
                                map(
                                    lambda x: '{:.4f}'.format(x),
                                    hypo['positional_scores'].tolist(),
                                ))))

                        if args.print_alignment:
                            print('A-{}\t{}'.format(
                                sample_id, ' '.join(
                                    map(lambda x: str(utils.item(x)),
                                        alignment))))

                    # Score only the top hypothesis
                    if has_target and j == 0:
                        if align_dict is not None or args.remove_bpe is not None:
                            # Convert back to tokens for evaluation with unk replacement and/or without BPE
                            target_tokens = tgt_dict.encode_line(
                                target_str, add_if_not_exist=True)
                        if hasattr(scorer, 'add_string'):
                            scorer.add_string(target_str, hypo_str)
                        else:
                            scorer.add(target_tokens, hypo_tokens)

            wps_meter.update(num_generated_tokens)
            t.log({'wps': round(wps_meter.avg)})
            num_sentences += sample['nsentences']

    print(
        '| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'
        .format(num_sentences, gen_timer.n, gen_timer.sum,
                num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if has_target:
        print('| Generate {} with beam={}: {}'.format(args.gen_subset,
                                                      args.beam,
                                                      scorer.result_string()))
    """
    with open(resfp, "a", encoding="UTF-8") as restore :
        for gen_str in hypo_strings:
            restore.write(gen_str+" \n")
        restore.close() 
    with open(resfp_, "a", encoding="UTF-8") as res_tore :
        for src_str in src_strings:
            res_tore.write(src_str+" \n")
        res_tore.close()"""
    with open(resfp, "w", encoding="UTF-8") as restore:
        json.dump(resdict, restore, ensure_ascii=False, indent=4)

    return scorer
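
For completeness, a hypothetical driver for a main(args) like the one above, assuming fairseq's standard generation argument plumbing:

from fairseq import options

if __name__ == '__main__':
    # get_generation_parser() exposes --path, --beam, --nbest, --remove-bpe, etc.
    parser = options.get_generation_parser()
    args = options.parse_args_and_arch(parser)
    main(args)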
Code Example #4
    def forward(
        self,
        sample,
        forward_model,
        forward_optimizer,
        tgt_dict,
        backward_model,
        backward_optimizer,
        src_dict,
        lm_scorer=None,
        reduce=True,
        **generate_kwargs,
    ):
        """Compute the reconstruction and LM loss from forward and backward
        models.

        Args:
            sample: original input.
            hypos: pseudo labels generated by the forward model. They are used
                as an approximation of the target space to do importance
                sampling.
            forward_model: the model used to generate pseudo labels.
            backward_model: the model to reconstruct the original input using
                pseudo labels.
            lm_scorer: an LM model in eval mode to score pseudo labels in the
                target space.
        """
        # Generate translations
        nbest_translations = self._generate_translation(
            forward_model, tgt_dict, sample, self.args.beam, **generate_kwargs)

        forward_samples = []
        backward_samples = {}
        # TODO (T36875783): load pretrained lm to score
        lm_score = 0.0
        for sample_id, src_processed, tgt_hypos in nbest_translations:
            # compute each model's reward
            forward_reward = lm_score
            # construct the sample; compute the ce loss
            # backward_samples need to handle EOS
            src = self._maybe_reverse_source(src_processed)
            src = self._maybe_add_eos(src, src_dict.eos())
            assert len(tgt_hypos) == self.args.beam
            for tgt_hypo_i, tgt_hypo_struct in enumerate(tgt_hypos):
                dual_sample_id = sample_id.item() * self.args.beam + tgt_hypo_i
                tgt_hypo = tgt_hypo_struct["tokens"]
                # Add EOS to the target (i.e., the original source), since it
                # will be used as the target; removing EOS from the source is
                # optional.
                if self.remove_eos_at_src:
                    tgt_hypo = tgt_hypo[:-1]
                tgt_hypo_processed = self._maybe_reverse_source(tgt_hypo)

                backward_sample = {
                    "id": dual_sample_id,
                    "source": tgt_hypo_processed.cpu(),
                    "target": src.cpu(),
                    "weight": 1.0 - self.alpha,
                }
                assert dual_sample_id not in backward_samples
                backward_samples[dual_sample_id] = backward_sample

        bwd_model_input = utils.move_to_cuda(
            WeightedLanguagePairDataset.collate(
                samples=list(backward_samples.values()),
                pad_idx=src_dict.pad(),
                eos_idx=src_dict.eos(),
            ))
        reconstructed_source = self._generate_translation(
            backward_model, src_dict, bwd_model_input, 1, **generate_kwargs)
        for dual_sample_id, tgt_hypo_processed, src_hypos in reconstructed_source:
            backward_sample = backward_samples[dual_sample_id.item()]
            src = backward_sample["target"]
            tgt_hypo = self._maybe_reverse_source(tgt_hypo_processed)

            # use bleu score as reward
            scorer = bleu.Scorer(src_dict.pad(), src_dict.eos(),
                                 src_dict.unk())
            assert len(src_hypos) == 1
            src_hypo = src_hypos[0]["tokens"][:-1]
            scorer.add(src.int().cpu(), src_hypo.int().cpu())
            backward_reward = (
                scorer.score(order=self.args.reconstruction_bleu_order) /
                100.0)

            original_stc = " ".join(src_dict[tid] for tid in src.tolist())
            translated_stc = " ".join(tgt_dict[tid] for tid in tgt_hypo)
            recon_stc = " ".join(src_dict[tid] for tid in src_hypo.tolist())

            if (int(dual_sample_id) // self.args.beam) % 100 == 0:
                print("--------")
                print(
                    "original sentence:",
                    original_stc.replace(self.args.source_bpe_end_marker, ""),
                )
                print(
                    "translated sentence:",
                    translated_stc.replace(self.args.source_bpe_end_marker,
                                           ""),
                )
                print(
                    "reconstructed sentence:",
                    recon_stc.replace(self.args.source_bpe_end_marker, ""),
                )
                print("reward:", backward_reward)
                print("--------")

            total_reward = (self.alpha * forward_reward +
                            (1.0 - self.alpha) * backward_reward)
            src_processed = self._maybe_reverse_source(src)
            tgt_hypo = self._maybe_add_eos(tgt_hypo, tgt_dict.eos())
            forward_samples.append({
                "id": dual_sample_id,
                "source": src_processed.cpu(),
                "target": tgt_hypo.cpu(),  # first hypo is best hypo
                "weight": total_reward,
            })

        # Now combine pseudo labelled examples to corresponding batch with
        # rewards factored to weighting of each task's loss
        agg_loss, agg_sample_size, agg_logging_output = 0.0, 0.0, {}
        forward_model.train()
        forward_loss, sample_size, logging_output = self.task.criterion(
            forward_model,
            utils.move_to_cuda(
                WeightedLanguagePairDataset.collate(
                    samples=forward_samples,
                    pad_idx=tgt_dict.pad(),
                    eos_idx=tgt_dict.eos(),
                )),
        )
        agg_loss += forward_loss.detach().item()
        agg_sample_size += sample_size
        agg_logging_output["primal"] = logging_output
        # grad would be further scaled when passed back to trainer,
        # which will do the update
        forward_optimizer.backward(forward_loss)

        backward_model.train()
        backward_loss, sample_size, logging_output = self.task.criterion(
            backward_model, bwd_model_input)

        agg_loss += backward_loss.data.item()
        agg_sample_size += sample_size
        agg_logging_output["dual"] = logging_output
        backward_optimizer.backward(backward_loss)
        return agg_loss, agg_sample_size, agg_logging_output
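
The weight attached to each forward sample is a convex combination of the two rewards. A toy check with made-up numbers (the forward LM reward is still a TODO in the code above):

alpha = 0.3
forward_reward = 0.0    # placeholder: pretrained-LM scoring is the TODO above
backward_reward = 0.42  # reconstruction BLEU scaled to [0, 1]
total_reward = alpha * forward_reward + (1.0 - alpha) * backward_reward
print(total_reward)     # 0.294, used as the per-sample loss weight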
Code Example #5
def _generate_score(models, args, task, dataset, optimize=True):
    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load ensemble
    if not args.quiet:
        print("| loading model(s) from {}".format(", ".join(
            args.path.split(":"))))

    # Optimize ensemble for generation
    if optimize:
        for model in models:
            model.make_generation_fast_(
                beamable_mm_beam_size=None
                if args.no_beamable_mm else args.beam,
                need_attn=True,
            )

    translator = build_sequence_generator(args, task, models)
    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Keep track of translations
    # Initialize with empty translations
    # and zero probs scores
    translated_sentences = [""] * len(dataset)
    translated_scores = [0.0] * len(dataset)

    collect_output_hypos = getattr(args, "output_hypos_binary_path", False)
    if collect_output_hypos:
        output_hypos_token_arrays = [None] * len(dataset)

    # Generate and compute BLEU score
    dst_dict = task.target_dictionary
    scorer = bleu.Scorer(dst_dict.pad(), dst_dict.eos(), dst_dict.unk())
    itr = get_eval_itr(args, models, task, dataset)

    oracle_scorer = None
    if args.report_oracle_bleu:
        oracle_scorer = bleu.Scorer(dst_dict.pad(), dst_dict.eos(),
                                    dst_dict.unk())

    num_sentences = 0
    translation_samples = []
    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        gen_timer = StopwatchMeter()
        translations = translator.generate_batched_itr(
            t,
            maxlen_a=args.max_len_a,
            maxlen_b=args.max_len_b,
            cuda=use_cuda,
            timer=gen_timer,
            prefix_size=1
            if pytorch_translate_data.is_multilingual(args) else 0,
        )

        for trans_info in _iter_translations(args, task, dataset, translations,
                                             align_dict):
            scorer.add(trans_info.target_tokens, trans_info.hypo_tokens)
            if oracle_scorer is not None:
                oracle_scorer.add(trans_info.target_tokens,
                                  trans_info.best_hypo_tokens)

            translated_sentences[trans_info.sample_id] = trans_info.hypo_str
            translated_scores[trans_info.sample_id] = trans_info.hypo_score
            if collect_output_hypos:
                output_hypos_token_arrays[
                    trans_info.sample_id] = trans_info.best_hypo_tokens
            translation_samples.append(
                collections.OrderedDict({
                    "sample_id": trans_info.sample_id.item(),
                    "src_str": trans_info.src_str,
                    "target_str": trans_info.target_str,
                    "hypo_str": trans_info.hypo_str,
                }))
            wps_meter.update(trans_info.src_tokens.size(0))
            t.log({"wps": round(wps_meter.avg)})
            num_sentences += 1

    # If applicable, save collected hypothesis tokens to binary output file
    if collect_output_hypos:
        output_dataset = pytorch_translate_data.InMemoryNumpyDataset()
        output_dataset.load_from_sequences(output_hypos_token_arrays)
        output_dataset.save(args.output_hypos_binary_path)

    # If applicable, save the translations to the output file
    # (e.g., for external evaluation)
    if getattr(args, "translation_output_file", False):
        with open(args.translation_output_file, "w") as out_file:
            for hypo_str in translated_sentences:
                print(hypo_str, file=out_file)

    if getattr(args, "translation_probs_file", False):
        with open(args.translation_probs_file, "w") as out_file:
            for hypo_score in translated_scores:
                print(np.exp(hypo_score), file=out_file)

    if oracle_scorer is not None:
        print(
            f"| Oracle BLEU (best hypo in beam): {oracle_scorer.result_string()}"
        )

    return scorer, num_sentences, gen_timer, translation_samples
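
The probs file stores np.exp(hypo_score) because generator scores are (length-normalized) log-probabilities. A quick sanity check with an invented score:

import numpy as np

hypo_score = np.log(0.6)   # the shape of a generator-style log-probability
print(np.exp(hypo_score))  # 0.6, the value written to translation_probs_file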
Code Example #6
def random_search(scores_info_export_path,
                  num_trials,
                  report_oracle_bleu=False):
    with open(scores_info_export_path, "rb") as f:
        scores_info = pickle.load(f)

    dummy_task = DummyTask()

    if report_oracle_bleu:
        oracle_scorer = bleu.Scorer(vocab_constants.PAD_ID,
                                    vocab_constants.EOS_ID,
                                    vocab_constants.UNK_ID)

        for example in scores_info:
            smoothed_bleu = []
            for hypo in example["hypos"]:
                eval_score = smoothed_sentence_bleu(
                    dummy_task,
                    torch.IntTensor(example["target_tokens"]),
                    torch.IntTensor(hypo),
                )
                smoothed_bleu.append(eval_score)
            best_hypo_ind = np.argmax(smoothed_bleu)
            example["best_hypo_ind"] = best_hypo_ind

            oracle_scorer.add(
                torch.IntTensor(example["target_tokens"]),
                torch.IntTensor(example["hypos"][best_hypo_ind]),
            )

        print("oracle BLEU: ", oracle_scorer.score())

    num_features = scores_info[0]["scores"].shape[1]
    assert all(
        example["scores"].shape[1] == num_features for example in
        scores_info), "All examples must have the same number of scores!"
    feature_weights = np.zeros(num_features)
    feature_weights[0] = 1
    score = evaluate_weights(scores_info, feature_weights, length_penalty=1)
    print("base BLEU: ", score)
    best_score = score
    best_weights = feature_weights
    best_length_penalty = 1  # matches the length_penalty used for the base score

    nonzero_features = identify_nonzero_features(scores_info)

    for i in range(num_trials):
        feature_weights = np.zeros(num_features)
        random_weights = np.random.dirichlet(np.ones(nonzero_features.size))
        feature_weights[nonzero_features] = random_weights
        length_penalty = 1.5 * np.random.random()

        score = evaluate_weights(scores_info, feature_weights, length_penalty)
        if score > best_score:
            best_score = score
            best_weights = feature_weights
            best_length_penalty = length_penalty

        print(f"\r[{i}]  best: {best_score}", end="", flush=True)

    print()
    print("best weights: ", best_weights)
    print("best length penalty: ", length_penalty)

    return best_weights, best_length_penalty, best_score
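
The trial weights come from a Dirichlet draw over the nonzero features, so they are non-negative and sum to one. A standalone illustration with an arbitrary feature count:

import numpy as np

weights = np.random.dirichlet(np.ones(4))  # 4 non-negative weights summing to 1.0
length_penalty = 1.5 * np.random.random()  # uniform draw on [0, 1.5)
print(weights, weights.sum(), length_penalty)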
Code Example #7
def score_target_hypo(args, a, b, c, lenpen, target_outfile, hypo_outfile,
                      write_hypos, normalize):

    print("lenpen", lenpen, "weight1", a, "weight2", b, "weight3", c)
    gen_output_lst, bitext1_lst, bitext2_lst, lm_res_lst = load_score_files(
        args)
    dict = dictionary.Dictionary()
    scorer = bleu.Scorer(dict.pad(), dict.eos(), dict.unk())

    ordered_hypos = {}
    ordered_targets = {}

    for shard_id in range(len(bitext1_lst)):
        bitext1 = bitext1_lst[shard_id]
        bitext2 = bitext2_lst[shard_id]
        gen_output = gen_output_lst[shard_id]
        lm_res = lm_res_lst[shard_id]

        total = len(bitext1.rescore_source.keys())
        source_lst = []
        hypo_lst = []
        score_lst = []
        reference_lst = []
        j = 1
        best_score = -math.inf

        for i in range(total):
            # length is measured in terms of words, not bpe tokens, since models may not share the same bpe
            target_len = len(bitext1.rescore_hypo[i].split())

            if lm_res is not None:
                lm_score = lm_res.score[i]
            else:
                lm_score = 0

            if bitext2 is not None:
                bitext2_score = bitext2.rescore_score[i]
                bitext2_backwards = bitext2.backwards
            else:
                bitext2_score = None
                bitext2_backwards = None

            score = rerank_utils.get_score(a,
                                           b,
                                           c,
                                           target_len,
                                           bitext1.rescore_score[i],
                                           bitext2_score,
                                           lm_score=lm_score,
                                           lenpen=lenpen,
                                           src_len=bitext1.source_lengths[i],
                                           tgt_len=bitext1.target_lengths[i],
                                           bitext1_backwards=bitext1.backwards,
                                           bitext2_backwards=bitext2_backwards,
                                           normalize=normalize)

            if score > best_score:
                best_score = score
                best_hypo = bitext1.rescore_hypo[i]

            if j == gen_output.num_hypos[i] or j == args.num_rescore:
                j = 1
                hypo_lst.append(best_hypo)
                score_lst.append(best_score)
                source_lst.append(bitext1.rescore_source[i])
                reference_lst.append(bitext1.rescore_target[i])

                best_score = -math.inf
                best_hypo = ""
            else:
                j += 1

        gen_keys = list(sorted(gen_output.no_bpe_target.keys()))

        for key in range(len(gen_keys)):
            if args.prefix_len is None:
                assert hypo_lst[key] in gen_output.no_bpe_hypo[gen_keys[key]], \
                    ("pred and rescore hypo mismatch: i: " + str(key) + ", " + str(hypo_lst[key]) + str(gen_keys[key]) +
                    str(gen_output.no_bpe_hypo[key]))
                sys_tok = dict.encode_line(hypo_lst[key])
                ref_tok = dict.encode_line(
                    gen_output.no_bpe_target[gen_keys[key]])
                scorer.add(ref_tok, sys_tok)

            else:
                full_hypo = rerank_utils.get_full_from_prefix(
                    hypo_lst[key], gen_output.no_bpe_hypo[gen_keys[key]])
                sys_tok = dict.encode_line(full_hypo)
                ref_tok = dict.encode_line(
                    gen_output.no_bpe_target[gen_keys[key]])
                scorer.add(ref_tok, sys_tok)

        # if only one set of hyperparameters is provided, write the predictions to a file
        if write_hypos:
            # recover the original ids from n-best list generation
            for key in range(len(gen_output.no_bpe_target)):
                if args.prefix_len is None:
                    assert hypo_lst[key] in gen_output.no_bpe_hypo[gen_keys[key]], \
                        "pred and rescore hypo mismatch:"+"i:"+str(key)+str(hypo_lst[key]) + str(gen_output.no_bpe_hypo[key])
                    ordered_hypos[gen_keys[key]] = hypo_lst[key]
                    ordered_targets[gen_keys[key]] = gen_output.no_bpe_target[
                        gen_keys[key]]

                else:
                    full_hypo = rerank_utils.get_full_from_prefix(
                        hypo_lst[key], gen_output.no_bpe_hypo[gen_keys[key]])
                    ordered_hypos[gen_keys[key]] = full_hypo
                    ordered_targets[gen_keys[key]] = gen_output.no_bpe_target[
                        gen_keys[key]]

    # write the hypos in the original order from n-best list generation
    if args.num_shards == len(bitext1_lst):
        with open(target_outfile, 'w') as t:
            with open(hypo_outfile, 'w') as h:
                for key in range(len(ordered_hypos)):
                    t.write(ordered_targets[key])
                    h.write(ordered_hypos[key])

    res = scorer.result_string(4)
    if write_hypos:
        print(res)
    score = rerank_utils.parse_bleu_scoring(res)
    return score
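
The rescoring loop scores plain-text hypotheses by round-tripping them through a fresh Dictionary. A minimal sketch of that encode-then-score pattern, relying on encode_line's default add_if_not_exist=True so both sides share one growing vocabulary:

from fairseq import bleu
from fairseq.data import dictionary

d = dictionary.Dictionary()
scorer = bleu.Scorer(d.pad(), d.eos(), d.unk())

sys_tok = d.encode_line("the cat sat on a mat")
ref_tok = d.encode_line("the cat sat on the mat")
scorer.add(ref_tok, sys_tok)
print(scorer.result_string(4))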
Code Example #8
def main(args):
    utils.import_user_module(args)

    if args.buffer_size < 1:
        args.buffer_size = 1
    if args.max_tokens is None and args.max_sentences is None:
        args.max_sentences = 1

    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert not args.max_sentences or args.max_sentences <= args.buffer_size, \
        '--max-sentences/--batch-size cannot be larger than --buffer-size'

    print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu
    use_ctc_loss = (args.criterion == 'ctc_loss')

    # Setup task, e.g., image captioning
    task = tasks.setup_task(args)
    # Load dataset split
    task.load_dataset(args.gen_subset, combine=True, epoch=0)

    # Load ensemble
    print('| loading model(s) from {}'.format(args.path))
    model_paths = args.path.split(':')
    models, _model_args = checkpoint_utils.load_model_ensemble(
        model_paths,
        arg_overrides=eval(args.model_overrides),
        task=task,
    )

    # Set dictionaries
    tgt_dict = task.target_dictionary

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[model.max_positions() for model in models]
        ),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=args.required_batch_size_multiple,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)

    # Initialize generator
    gen_timer = StopwatchMeter()
    generator = task.build_generator(args)

    # Generate and compute BLEU score
    if args.sacrebleu:
        scorer = bleu.SacrebleuScorer()
    else:
        scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk())

    stats = collections.OrderedDict()
    num_sentences = 0
    num_correct = 0
    has_target = True

    with progress_bar.build_progress_bar(
        args, itr,
        prefix='inference on \'{}\' subset'.format(args.gen_subset),
        no_progress_bar='simple',
    ) as progress:
        wps_meter = TimeMeter()
        for sample in progress:
            sample = utils.move_to_cuda(sample) if use_cuda else sample
            gen_timer.start()
            hypos = task.inference_step(generator, models, sample)
            num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos)
            gen_timer.stop(num_generated_tokens)

            for i, sample_id in enumerate(sample['id'].tolist()):
                has_target = sample['target'] is not None
                target_tokens = None
                if has_target:
                    if use_ctc_loss:
                        target_tokens = sample['target'][i]
                        target_str = tgt_dict.string(target_tokens, args.remove_bpe, escape_unk=True)
                    else:
                        # Remove padding
                        target_tokens = utils.strip_pad(sample['target'][i, :], tgt_dict.pad()).int().cpu()
                        # Regenerate original sentences from tokens.
                        target_str = tgt_dict.string(target_tokens, args.remove_bpe, escape_unk=True)

                if not args.quiet:
                    if has_target:
                        print('\nT-{}\t{}'.format(sample_id, target_str))

                # Process top predictions
                hypo = hypos[i][0]
                hypo_tokens = hypo['tokens'] if use_ctc_loss else hypo['tokens'].int().cpu()
                hypo_str = tgt_dict.string(hypo_tokens, args.remove_bpe, escape_unk=True)
                alignment = hypo['alignment'].int().cpu() if hypo['alignment'] is not None else None

                if has_target and hypo_str == target_str:
                    num_correct += 1

                if not args.quiet:
                    print('H-{}\t{}\t{}'.format(sample_id, hypo_str, hypo['score']))
                    print('P-{}\t{}'.format(
                        sample_id,
                        ' '.join(map(
                            lambda x: '{:.4f}'.format(x),
                            hypo['positional_scores'].tolist(),
                        )) if not use_ctc_loss else None
                    ))

                    if args.print_alignment:
                        print('A-{}\t{}'.format(
                            sample_id,
                            ' '.join(map(lambda x: str(utils.item(x)), alignment))
                        ))

                # Score only the top hypothesis
                if has_target:
                    if hasattr(scorer, 'add_string'):
                        scorer.add_string(target_str, hypo_str)
                    else:
                        scorer.add(target_tokens, hypo_tokens)

            wps_meter.update(num_generated_tokens)
            num_sentences += sample['nsentences']
            stats['wps'] = round(wps_meter.avg)
            stats['acc'] = num_correct / num_sentences
            progress.log(stats, tag='accuracy')
        progress.print(stats, tag='accuracy')

    print('| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'.format(
        num_sentences, gen_timer.n, gen_timer.sum, num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if has_target:
        print('| Generate {} with beam={}: {}'.format(args.gen_subset, args.beam, scorer.result_string()))
    return scorer
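
Alongside BLEU, this example tracks an exact-match accuracy: a hypothesis counts as correct only when its detokenized string equals the reference exactly. The metric in isolation, on toy data:

hypo_strs = ["a cat sat", "the dog runs"]
target_strs = ["a cat sat", "a dog runs"]
num_correct = sum(h == t for h, t in zip(hypo_strs, target_strs))
print(num_correct / len(target_strs))  # 0.5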
Code Example #9
def main(args):
    utils.import_user_module(args)

    if args.buffer_size < 1:
        args.buffer_size = 1
    if args.max_tokens is None and args.max_sentences is None:
        args.max_sentences = 1

    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert not args.max_sentences or args.max_sentences <= args.buffer_size, \
        '--max-sentences/--batch-size cannot be larger than --buffer-size'

    logger.info(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Setup task, e.g., translation
    task = tasks.setup_task(args)

    # Load ensemble
    logger.info('loading model(s) from {}'.format(args.path))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        args.path.split(os.pathsep),
        arg_overrides=eval(args.model_overrides),
        task=task,
    )

    # Set dictionaries
    src_dict = task.source_dictionary
    tgt_dict = task.target_dictionary

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Initialize generator
    gen_timer = StopwatchMeter()
    generator = task.build_generator(args)

    # Handle tokenization and BPE
    tokenizer = encoders.build_tokenizer(args)
    bpe = encoders.build_bpe(args)

    def encode_fn(x):
        if tokenizer is not None:
            x = tokenizer.encode(x)
        if bpe is not None:
            x = bpe.encode(x)
        return x

    def decode_fn(x):
        if bpe is not None:
            x = bpe.decode(x)
        if tokenizer is not None:
            x = tokenizer.decode(x)
        return x

    # Generate and compute BLEU score
    if args.sacrebleu:
        scorer = bleu.SacrebleuScorer()
    else:
        scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk())

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    max_positions = utils.resolve_max_positions(
        task.max_positions(), *[model.max_positions() for model in models])

    num_sentences = 0
    if args.buffer_size > 1:
        logger.info('Sentence buffer size: %s', args.buffer_size)
    logger.info('NOTE: hypothesis and token scores are output in base 2')
    logger.info('Type the input sentence and press return:')
    start_id = 0
    for inputs in buffered_read(args.input, args.buffer_size):
        results = []
        for batch in make_batches(inputs, args, task, max_positions,
                                  encode_fn):
            src_tokens = batch.src_tokens
            src_lengths = batch.src_lengths
            tgt_tokens = batch.tgt_tokens
            num_sentences += src_tokens[0].size(0)
            if use_cuda:
                if isinstance(src_tokens, list):
                    src_tokens = [tokens.cuda() for tokens in src_tokens]
                    src_lengths = [lengths.cuda() for lengths in src_lengths]
                else:
                    src_tokens = src_tokens.cuda()
                    src_lengths = src_lengths.cuda()

            sample = {
                'net_input': {
                    'src_tokens': src_tokens,
                    'src_lengths': src_lengths,
                },
                'target': tgt_tokens,
            }

            gen_timer.start()
            translations = task.inference_step(generator, models, sample)
            num_generated_tokens = sum(
                len(h[0]['tokens']) for h in translations)
            gen_timer.stop(num_generated_tokens)

            for i, (id,
                    hypos) in enumerate(zip(batch.ids.tolist(), translations)):
                src_tokens_i = utils.strip_pad(src_tokens[i], tgt_dict.pad())
                tgt_tokens_i = None
                if tgt_tokens is not None:
                    tgt_tokens_i = utils.strip_pad(tgt_tokens[i, :],
                                                   tgt_dict.pad()).int().cpu()
                results.append(
                    (start_id + id, src_tokens_i, hypos, tgt_tokens_i))

        # sort output to match input order
        for id, src_tokens, hypos, tgt_tokens in sorted(results,
                                                        key=lambda x: x[0]):
            if src_dict is not None:
                src_str = src_dict.string(src_tokens, args.remove_bpe)
                print('S-{}\t{}'.format(id, src_str))

            if tgt_tokens is not None:
                tgt_str = tgt_dict.string(tgt_tokens,
                                          args.remove_bpe,
                                          escape_unk=True)
                print('T-{}\t{}'.format(id, tgt_str))

            # Process top predictions
            for j, hypo in enumerate(hypos[:min(len(hypos), args.nbest)]):
                hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                    hypo_tokens=hypo['tokens'].int().cpu(),
                    src_str=src_str,
                    alignment=hypo['alignment'],
                    align_dict=align_dict,
                    tgt_dict=tgt_dict,
                    remove_bpe=args.remove_bpe,
                )
                hypo_str = decode_fn(hypo_str)
                score = hypo['score'] / math.log(2)  # convert to base 2
                print('H-{}\t{}\t{}'.format(id, score, hypo_str))
                print('P-{}\t{}'.format(
                    id,
                    ' '.join(
                        map(
                            lambda x: '{:.4f}'.format(x),
                            # convert from base e to base 2
                            hypo['positional_scores'].div_(math.log(2)
                                                           ).tolist(),
                        ))))
                if args.print_alignment:
                    alignment_str = " ".join(
                        ["{}-{}".format(src, tgt) for src, tgt in alignment])
                    print('A-{}\t{}'.format(id, alignment_str))
                if args.print_step:
                    print('I-{}\t{}'.format(id, hypo['steps']))
                    print('O-{}\t{}'.format(id, hypo['num_ops']))

                if getattr(args, 'retain_iter_history', False):
                    for step, h in enumerate(hypo['history']):
                        _, h_str, _ = utils.post_process_prediction(
                            hypo_tokens=h['tokens'].int().cpu(),
                            src_str=src_str,
                            alignment=None,
                            align_dict=None,
                            tgt_dict=tgt_dict,
                            remove_bpe=None,
                        )
                        print('E-{}_{}\t{}'.format(id, step, h_str))

                # Score only the top hypothesis
                if tgt_tokens is not None and j == 0:
                    if align_dict is not None or args.remove_bpe is not None:
                        # Convert back to tokens for evaluation with unk replacement and/or without BPE
                        tgt_tokens = tgt_dict.encode_line(
                            tgt_str, add_if_not_exist=True)
                    if hasattr(scorer, 'add_string'):
                        scorer.add_string(tgt_str, hypo_str)
                    else:
                        scorer.add(tgt_tokens, hypo_tokens)

        # update running id counter
        start_id += len(inputs)

    logger.info(
        'Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'
        .format(num_sentences, gen_timer.n, gen_timer.sum,
                num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if args.has_target:
        logger.info('Generate with beam={}: {}'.format(args.beam,
                                                       scorer.result_string()))
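
As the log message notes, hypothesis and positional scores are printed in base 2: the models report natural-log probabilities, and dividing by math.log(2) converts between bases. A quick check with an invented score:

import math

score_ln = math.log(0.25)      # natural-log probability from the model
print(score_ln / math.log(2))  # -2.0, i.e. log2(0.25)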
Code Example #10
def main(args):
    assert args.path is not None, '--path required for generation!'
    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert args.replace_unk is None or args.raw_text, \
        '--replace-unk requires a raw text dataset (--raw-text)'

    import_user_module(args)

    if args.max_tokens is None and args.max_sentences is None:
        args.max_tokens = 12000
    print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args.gen_subset, args=args)
    print('| {} {} {} examples'.format(args.data, args.gen_subset,
                                       len(task.dataset(args.gen_subset))))

    # Set dictionaries
    try:
        src_dict = getattr(task, 'source_dictionary', None)
    except NotImplementedError:
        src_dict = None
    tgt_dict = task.target_dictionary

    args.unk_idx = task.src_dict.indices['<unk>']
    args.dict_len = len(task.src_dict.indices)
    if '[APPEND]' in task.src_dict.indices:
        args.APPEND_ID = task.src_dict.indices['[APPEND]']
        print("[APPEND] ID: {}".format(args.APPEND_ID))
    else:
        args.APPEND_ID = -1
    if '[SRC]' in task.src_dict.indices:
        args.SRC_ID = task.src_dict.indices['[SRC]']
        print("[SRC] ID: {}".format(args.SRC_ID))
    else:
        args.SRC_ID = -1
    if '[TGT]' in task.src_dict.indices:
        args.TGT_ID = task.src_dict.indices['[TGT]']
        print("[TGT] ID: {}".format(args.TGT_ID))
    else:
        args.TGT_ID = -1
    if '[SEP]' in task.src_dict.indices:
        args.SEP_ID = task.src_dict.indices['[SEP]']
        print("[SEP] ID: {}".format(args.SEP_ID))
    else:
        args.SEP_ID = -1
    if '</s>' in task.src_dict.indices:
        args.EOS_ID = task.src_dict.indices['</s>']
    else:
        args.EOS_ID = -1
    if '<pad>' in task.src_dict.indices:
        args.PAD_ID = task.src_dict.indices['<pad>']
    else:
        args.PAD_ID = -1

    # Load ensemble
    print('| loading model(s) from {}'.format(args.path))
    models, _model_args = utils.load_ensemble_for_inference(
        args.path.split(':'),
        task,
        model_arg_overrides=eval(args.model_overrides),
    )
    _model_args.avgpen = args.avgpen
    task.datasets['test'].args = _model_args

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[model.max_positions() for model in models]),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=args.required_batch_size_multiple,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)

    # Initialize generator
    gen_timer = StopwatchMeter()
    generator = task.build_generator(args)

    # Generate and compute BLEU score
    if args.sacrebleu:
        scorer = bleu.SacrebleuScorer()
    else:
        scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk())
    num_sentences = 0
    has_target = True
    select_retrieve_tokens = []
    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        trans_results = []
        for sample in t:
            sample = utils.move_to_cuda(sample) if use_cuda else sample
            if 'net_input' not in sample:
                continue

            prefix_tokens = None
            if args.prefix_size > 0:
                prefix_tokens = sample['target'][:, :args.prefix_size]

            gen_timer.start()
            hypos, encoder_outs = task.inference_step(generator, models,
                                                      sample, prefix_tokens)
            num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos)
            gen_timer.stop(num_generated_tokens)

            for i, sample_id in enumerate(sample['id'].tolist()):
                has_target = sample['target'] is not None

                # Remove padding
                src_tokens, retrieve_source_tokens, retrieve_target_tokens = sample[
                    'net_input']['src_tokens']
                retrieve_tokens = list(
                    itertools.chain.from_iterable(
                        zip(retrieve_source_tokens, retrieve_target_tokens)))
                retrieve_tokens = torch.cat(retrieve_tokens, dim=1)
                all_tokens = torch.cat([src_tokens, retrieve_tokens], dim=1)
                src_tokens = utils.strip_pad(all_tokens[i, :], tgt_dict.pad())
                target_tokens = None
                target_str = ""  # default when no reference is available
                if has_target:
                    target_tokens = utils.strip_pad(
                        sample['target'][i, :], tgt_dict.pad()).int().cpu()


                # Either retrieve the original sentences or regenerate them from tokens.
                if align_dict is not None:
                    src_str = task.dataset(
                        args.gen_subset).src.get_original_text(sample_id)
                    target_str = task.dataset(
                        args.gen_subset).tgt.get_original_text(sample_id)
                else:
                    if src_dict is not None:
                        src_str = src_dict.string(src_tokens, args.remove_bpe)
                    else:
                        src_str = ""
                    if has_target:
                        target_str = tgt_dict.string(target_tokens,
                                                     args.remove_bpe,
                                                     escape_unk=True)

                if not args.quiet:
                    if src_dict is not None:
                        print('S-{}\t{}'.format(sample_id, src_str))
                    if has_target:
                        print('T-{}\t{}'.format(sample_id, target_str))

                # add select tokens
                select_retrieve_tokens.append([
                    sample_id, src_str, target_str,
                    sample['predict_ground_truth'][i, :],
                    retrieve_tokens[i, :],
                    encoder_outs[0]['new_retrieve_tokens'][i, :],
                    utils.strip_pad(retrieve_tokens[i, :],
                                    src_dict.pad()).tolist(),
                    utils.strip_pad(
                        encoder_outs[0]['new_retrieve_tokens'][i, :],
                        src_dict.pad()).tolist()
                ])
                # Process top predictions
                for j, hypo in enumerate(hypos[i][:args.nbest]):
                    hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                        hypo_tokens=hypo['tokens'].int().cpu(),
                        src_str=src_str,
                        alignment=hypo['alignment'].int().cpu()
                        if hypo['alignment'] is not None else None,
                        align_dict=align_dict,
                        tgt_dict=tgt_dict,
                        remove_bpe=args.remove_bpe,
                    )

                    trans_results.append((sample_id, hypo_str))
                    if not args.quiet:
                        print('H-{}\t{}\t{}'.format(sample_id, hypo['score'],
                                                    hypo_str))
                        print('P-{}\t{}'.format(
                            sample_id, ' '.join(
                                map(
                                    lambda x: '{:.4f}'.format(x),
                                    hypo['positional_scores'].tolist(),
                                ))))
                        if args.print_alignment:
                            print('A-{}\t{}'.format(
                                sample_id, ' '.join(
                                    map(lambda x: str(utils.item(x)),
                                        alignment))))

                    # Score only the top hypothesis
                    if has_target and j == 0:
                        if align_dict is not None or args.remove_bpe is not None:
                            # Convert back to tokens for evaluation with unk replacement and/or without BPE
                            target_tokens = tgt_dict.encode_line(
                                target_str, add_if_not_exist=True)
                        if hasattr(scorer, 'add_string'):
                            scorer.add_string(target_str, hypo_str)
                        else:
                            scorer.add(target_tokens, hypo_tokens)

            wps_meter.update(num_generated_tokens)
            t.log({'wps': round(wps_meter.avg)})
            num_sentences += sample['nsentences']

    print(
        '| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'
        .format(num_sentences, gen_timer.n, gen_timer.sum,
                num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if has_target:
        print('| Generate {} with beam={}: {}'.format(args.gen_subset,
                                                      args.beam,
                                                      scorer.result_string()))

    trans_results.sort(key=lambda key: key[0])
    print("saving translation result to {}...".format(args.output))
    with open(args.output, "w", encoding="utf-8") as w:
        for item in trans_results:
            w.write("{}\n".format(item[1].replace("<<unk>>", "")))
    select_retrieve_tokens.sort(key=lambda key: key[0])
    orig_retrieve_tokens_length = 0
    select_retrieve_tokens_length = 0
    correct_tokens = 0
    with open(args.output + ".select", "w", encoding="utf-8") as w_select:
        for item in select_retrieve_tokens:
            sample_id, src_str, target_str, sample_predict_ground_truth, sample_orig_id, sample_select_retrieve_id, sample_orig_retrieve_tokens, sample_select_retrieve_tokens = item
            retrieve_str = src_dict.string(sample_orig_retrieve_tokens,
                                           args.remove_bpe)
            select_str = src_dict.string(sample_select_retrieve_tokens,
                                         args.remove_bpe)
            w_select.write("{}\n{}\n{}\n{}\n\n".format(src_str, target_str,
                                                       retrieve_str,
                                                       select_str))
            orig_retrieve_tokens_length += len(sample_orig_retrieve_tokens)
            select_retrieve_tokens_length += len(sample_select_retrieve_tokens)
            # Accuracy: a position counts as correct when the keep/drop
            # decision (non-pad in the selected ids) matches the ground
            # truth; positions that were padding in the original retrieval
            # are masked out of the count.
            correct_tokens += (
                (sample_select_retrieve_id != _model_args.PAD_ID
                 ).long() == sample_predict_ground_truth).masked_fill(
                     (sample_orig_id == _model_args.PAD_ID).byte(), 0).sum()

    ratio = select_retrieve_tokens_length / float(orig_retrieve_tokens_length)
    accuracy = correct_tokens.item() / float(orig_retrieve_tokens_length)
    print("Selective Tokens: {}".format(ratio))
    print("Correct Tokens: {}".format(accuracy))

    with open("{}.RetrieveNMT.BLEU".format(args.output), "a",
              encoding="utf-8") as w:
        w.write(
            '{}->{}: Generate {} with beam={} and lenpen={}: {};\tSelection Ratio: {};\tAccuracy:{}\n'
            .format(args.source_lang, args.target_lang,
                    args.gen_subset, args.beam, args.lenpen,
                    scorer.result_string(), ratio, accuracy))

    return scorer
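
For reference, a minimal sketch of how such a main(args) entry point is typically wired up with fairseq's generation parser; the cli_main wrapper below is an assumption for illustration, not part of the original example:

def cli_main():
    # assumed wiring, using fairseq's standard generation parser
    from fairseq import options
    parser = options.get_generation_parser()
    args = options.parse_args_and_arch(parser)
    main(args)


if __name__ == '__main__':
    cli_main()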
Code Example #11
 def __init__(self, args, src_dict, dst_dict):
     super().__init__(args, src_dict, dst_dict)
     self.translator = None
     self.scorer = bleu.Scorer(dst_dict.pad(), dst_dict.eos(),
                               dst_dict.unk())
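
For context, a hedged sketch of how the scorer created here is consumed elsewhere in these examples (ref_tokens and hypo_tokens are illustrative names for integer token tensors over dst_dict):

self.scorer.add(ref_tokens, hypo_tokens)  # once per sentence pair
print(self.scorer.result_string())        # corpus-level BLEU summary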
Code Example #12
def main():
    parser = options.get_parser('Generation')
    parser.add_argument('--path',
                        metavar='FILE',
                        required=True,
                        action='append',
                        help='path(s) to model file(s)')
    dataset_args = options.add_dataset_args(parser)
    dataset_args.add_argument('--batch-size',
                              default=32,
                              type=int,
                              metavar='N',
                              help='batch size')
    dataset_args.add_argument(
        '--gen-subset',
        default='test',
        metavar='SPLIT',
        help='data subset to generate (train, valid, test)')
    options.add_generation_args(parser)

    args = parser.parse_args()
    if args.no_progress_bar and args.log_format is None:
        args.log_format = 'none'
    print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset
    if args.replace_unk is None:
        dataset = data.load_dataset(args.data, [args.gen_subset],
                                    args.source_lang, args.target_lang)
    else:
        dataset = data.load_raw_text_dataset(args.data, [args.gen_subset],
                                             args.source_lang,
                                             args.target_lang)
    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    # Load ensemble
    print('| loading model(s) from {}'.format(', '.join(args.path)))
    models, _ = utils.load_ensemble_for_inference(args.path, dataset.src_dict,
                                                  dataset.dst_dict)

    print('| [{}] dictionary: {} types'.format(dataset.src,
                                               len(dataset.src_dict)))
    print('| [{}] dictionary: {} types'.format(dataset.dst,
                                               len(dataset.dst_dict)))
    print('| {} {} {} examples'.format(args.data, args.gen_subset,
                                       len(dataset.splits[args.gen_subset])))

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam)

    # Initialize generator
    translator = SequenceGenerator(models,
                                   beam_size=args.beam,
                                   stop_early=(not args.no_early_stop),
                                   normalize_scores=(not args.unnormalized),
                                   len_penalty=args.lenpen,
                                   unk_penalty=args.unkpen)
    if use_cuda:
        translator.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Generate and compute BLEU score
    scorer = bleu.Scorer(dataset.dst_dict.pad(), dataset.dst_dict.eos(),
                         dataset.dst_dict.unk())
    max_positions = min(model.max_encoder_positions() for model in models)
    itr = dataset.eval_dataloader(
        args.gen_subset,
        max_sentences=args.batch_size,
        max_positions=max_positions,
        skip_invalid_size_inputs_valid_test=args.skip_invalid_size_inputs_valid_test)
    num_sentences = 0
    with utils.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        gen_timer = StopwatchMeter()
        translations = translator.generate_batched_itr(
            t,
            maxlen_a=args.max_len_a,
            maxlen_b=args.max_len_b,
            cuda_device=0 if use_cuda else None,
            timer=gen_timer)
        for sample_id, src_tokens, target_tokens, hypos in translations:
            # Process input and ground truth
            target_tokens = target_tokens.int().cpu()
            # Either retrieve the original sentences or regenerate them from tokens.
            if align_dict is not None:
                src_str = dataset.splits[
                    args.gen_subset].src.get_original_text(sample_id)
                target_str = dataset.splits[
                    args.gen_subset].dst.get_original_text(sample_id)
            else:
                src_str = dataset.src_dict.string(src_tokens, args.remove_bpe)
                target_str = dataset.dst_dict.string(target_tokens,
                                                     args.remove_bpe,
                                                     escape_unk=True)

            if not args.quiet:
                print('S-{}\t{}'.format(sample_id, src_str))
                print('T-{}\t{}'.format(sample_id, target_str))

            # Process top predictions
            for i, hypo in enumerate(hypos[:min(len(hypos), args.nbest)]):
                hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                    hypo_tokens=hypo['tokens'].int().cpu(),
                    src_str=src_str,
                    alignment=hypo['alignment'].int().cpu(),
                    align_dict=align_dict,
                    dst_dict=dataset.dst_dict,
                    remove_bpe=args.remove_bpe)

                if not args.quiet:
                    print('H-{}\t{}\t{}'.format(sample_id, hypo['score'],
                                                hypo_str))
                    print('A-{}\t{}'.format(sample_id,
                                            ' '.join(map(str, alignment))))

                # Score only the top hypothesis
                if i == 0:
                    if align_dict is not None or args.remove_bpe is not None:
                        # Convert back to tokens for evaluation with unk replacement and/or without BPE
                        target_tokens = tokenizer.Tokenizer.tokenize(
                            target_str,
                            dataset.dst_dict,
                            add_if_not_exist=True)
                    scorer.add(target_tokens, hypo_tokens)

            wps_meter.update(src_tokens.size(0))
            t.log({'wps': round(wps_meter.avg)})
            num_sentences += 1

    print('| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} tokens/s)'.
          format(num_sentences, gen_timer.n, gen_timer.sum,
                 1. / gen_timer.avg))
    print('| Generate {} with beam={}: {}'.format(args.gen_subset, args.beam,
                                                  scorer.result_string()))
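
For context, the file consumed by utils.load_align_dict is a plain-text list of "src_word tgt_word" pairs. Below is a simplified sketch of the replacement rule it drives, modeled on fairseq's unknown-word replacement; the names are illustrative, not the exact library code:

def replace_unk_sketch(hypo_words, src_words, alignment, align_dict, unk='<unk>'):
    for i, word in enumerate(hypo_words):
        if word == unk:
            # pick the attention-aligned source word, falling back to
            # copying it verbatim when the dictionary has no entry
            src_word = src_words[alignment[i]]
            hypo_words[i] = align_dict.get(src_word, src_word)
    return hypo_words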
Code Example #13
def score(args, trainer, task, epoch_itr, subset):

    begin = time.time()

    if subset not in task.datasets:
        task.load_dataset(subset)

    # Deep copies are necessary: generation of translations alters the
    # target dictionary, which would mess up the rest of training.
    src_dict = deepcopy(task.source_dictionary)
    tgt_dict = deepcopy(task.target_dictionary)

    model = trainer.get_model()

    # Initialize data iterator
    itr = data.EpochBatchIterator(
        dataset=task.dataset(subset),
        max_tokens=None,
        max_sentences=max(
            8, min(math.ceil(1024 / args.distributed_world_size), 128)),
        max_positions=model.max_positions(),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=8,
        num_shards=args.distributed_world_size,
        shard_id=args.distributed_rank,
    ).next_epoch_itr(shuffle=False)

    # Initialize generator
    gen_timer = StopwatchMeter()
    translator = SequenceGenerator(
        [model],
        tgt_dict,
        beam_size=args.beam,
        stop_early=(not args.no_early_stop),
        normalize_scores=(not args.unnormalized),
        len_penalty=args.lenpen,
        unk_penalty=args.unkpen,
        sampling=args.sampling,
        sampling_topk=args.sampling_topk,
        minlen=args.min_len,
    )
    # Generate and compute BLEU
    bleu_dict = dictionary.Dictionary()  # avoid shadowing the built-in dict
    scorer = bleu.Scorer(bleu_dict.pad(), bleu_dict.eos(), bleu_dict.unk())
    num_sentences = 0
    has_target = True
    predictions = []
    with progress_bar.build_progress_bar(args, itr) as progress:
        translations = translator.generate_batched_itr(
            progress,
            maxlen_a=args.max_len_a,
            maxlen_b=args.max_len_b,
            cuda=True,
            timer=gen_timer,
            prefix_size=args.prefix_size,
        )

        wps_meter = TimeMeter()
        for sample_id, src_tokens, target_tokens, hypos in translations:
            # Process input and ground truth
            has_target = target_tokens is not None
            target_tokens = target_tokens.int().cpu() if has_target else None

            src_str = src_dict.string(src_tokens, args.remove_bpe)
            if has_target:
                target_str = tgt_dict.string(target_tokens,
                                             args.remove_bpe,
                                             escape_unk=True)

            # Process top predictions
            for i, hypo in enumerate(hypos[:min(len(hypos), args.nbest)]):
                hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                    hypo_tokens=hypo['tokens'].int().cpu(),
                    src_str=src_str,
                    alignment=hypo['alignment'].int().cpu()
                    if hypo['alignment'] is not None else None,
                    align_dict=None,
                    tgt_dict=tgt_dict,
                    remove_bpe=args.remove_bpe)

                # Score only the top hypothesis
                if has_target and i == 0:
                    if args.sentencepiece:
                        hypo_str = hypo_str.replace(' ', '').replace('▁', ' ')
                        target_str = target_str.replace(' ',
                                                        '').replace('▁', ' ')
                    sys_tok = tokenizer.Tokenizer.tokenize(
                        (hypo_str.lower() if args.ignore_case else hypo_str),
                        bleu_dict)
                    ref_tok = tokenizer.Tokenizer.tokenize(
                        (target_str.lower()
                         if args.ignore_case else target_str), bleu_dict)
                    scorer.add(ref_tok, sys_tok)
                    if not args.sentencepiece:
                        hypo_str = tokenizer.Tokenizer.detokenize(
                            hypo_str, 'de')
                    predictions.append('{}\t{}'.format(sample_id, hypo_str))

            wps_meter.update(src_tokens.size(0))
            progress.log({'wps': round(wps_meter.avg)})
            num_sentences += 1

    if args.distributed_world_size > 1:
        _all_gather_bleu_scorer(scorer)
        predictions = _all_gather_predictions(predictions)

    with open(os.path.join(args.data, 'sacrebleu_reference.de'),
              'r') as reference:
        refs = [reference.readlines()]
    #reducing indexed predictions as strings is more memory efficient than reducing tuples
    predictions = [tuple(item.split('\t')) for item in predictions]
    predictions = [(int(item[0]), item[1]) for item in predictions]
    predictions.sort(key=lambda tup: tup[0])
    predictions = [
        hypo[1] + ('\n' if hypo[1][-1] != '\n' else '') for hypo in predictions
    ]
    sacrebleu_score = sacrebleu.corpus_bleu(predictions,
                                            refs,
                                            lowercase=args.ignore_case)
    print(f'|Detokenized {sacrebleu_score}')
    if gen_timer.sum != 0:
        print(
            '| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'
            .format(num_sentences, gen_timer.n, gen_timer.sum,
                    num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if has_target:
        print('| Generate {} with beam={}: {}'.format(subset, args.beam,
                                                      scorer.result_string()))

    print('| Eval completed in: {:.2f}s'.format(time.time() - begin))

    return scorer.score(order=4), sacrebleu_score.score
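
The detokenized score above comes from sacrebleu's corpus-level API; a self-contained sketch of that call (the sentences are illustrative):

import sacrebleu

hypotheses = ['the cat sat on the mat .']
references = [['the cat sat on the mat .']]  # one inner list per reference set
print(sacrebleu.corpus_bleu(hypotheses, references, lowercase=False))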
Code Example #14
def _main(args, output_file):
    logging.basicConfig(
        format='%(asctime)s | %(levelname)s | %(name)s | %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        level=logging.INFO,
        stream=output_file,
    )
    logger = logging.getLogger('fairseq_cli.generate')

    utils.import_user_module(args)

    if args.max_tokens is None and args.max_sentences is None:
        args.max_tokens = 12000
    logger.info(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args.gen_subset)

    # Set dictionaries
    try:
        src_dict = getattr(task, 'source_dictionary', None)
    except NotImplementedError:
        src_dict = None
    tgt_dict = task.target_dictionary

    # Load ensemble
    logger.info('loading model(s) from {}'.format(args.path))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        utils.split_paths(args.path),
        arg_overrides=eval(args.model_overrides),
        task=task,
    )

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[model.max_positions() for model in models]),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=args.required_batch_size_multiple,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)
    progress = progress_bar.progress_bar(
        itr,
        log_format=args.log_format,
        log_interval=args.log_interval,
        default_log_format=('tqdm' if not args.no_progress_bar else 'none'),
    )

    # debug: n-bit uniform quantization helper
    def quantize(data, n, max_value=1):
        scale = ((2**(n) - 1) / 2) / torch.max(torch.abs(data))  # adaptive max
        #scale = ((2**(n) - 1) / 2) / max_value  # static max (predetermined)
        return torch.round(scale * data) / scale
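    # Worked example (illustrative numbers, not from the original run): with
    # n=8 and max|data| = 0.5, scale = 127.5 / 0.5 = 255, so every weight is
    # rounded to the nearest multiple of 1/255, i.e. symmetric uniform
    # quantization with roughly 2**8 representable levels.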

    # Quantize the model layer by layer to n bits: layer names are read from
    # a text file (one fully-qualified parameter name per line), per-layer
    # maxima from a pickled dict.
    #fileName = 'model_wmt14.weights.layers'
    fileName = 'model_iwslt14.tokenized.de-en.weights.layers'
    with open(fileName) as f:
        layersNamesList = [layerName.rstrip('\n') for layerName in f]
    with open("layer_max_dict.pkl", "rb") as f:
        layer_max_dict = pickle.load(f)
    n = 8  # quantization bit-width
    for layer in layersNamesList:
        print('----------')
        print(layer)
        kernel = eval(layer)
        max_value = layer_max_dict[layer].item()  # only used by the static variant
        kernel_q = quantize(kernel, n)  # adaptive (max computed on the fly)
        #kernel_q = quantize(kernel, n, max_value)  # static (predetermined max)
        exec(layer + '=' + 'torch.nn.Parameter(kernel_q)')
        print(len(eval(layer).unique()))  # distinct values after quantization
    """ 
    # quantize model layer by layer to n-bit
    print("#########################################")
    #print(model.encoder.embed_tokens.weight.shape)
    fileName = 'model_print.keys.weights.layers'
    with open(fileName) as f:
        layersList = f.readlines()
    layersNamesList = [layerName.rstrip('\n') for layerName in layersList]
    for layer in layersNamesList:
        #print(vars(layer).shape) 
        #print(model.encoder.embed_tokens.weight)
        #print(exec(layer))
        #print(globals()[layer]) 
        #print(eval(layer).shape) 

        print('------------')
        print(layer)
        kernel = eval(layer)
        kernel_q = quantize(kernel)
        #eval(layer) = torch.nn.Parameter(kernel_q)
        exec(layer + '=' + 'torch.nn.Parameter(kernel_q)')
        print(len((eval(layer)).unique()))
        #print(model)
        #kernel = model.decoder.layers[3].fc1.weight
        #print(kernel.shape)
        #print(torch.max(torch.abs(kernel)))
        #print(kernel[0][0:3])
        #print(len(set(model.decoder.layers[3].fc1.weight)))
        #kernel_q = quantize(kernel)
        #print(kernel_q[0][0:3])
        #model.decoder.layers[3].fc1.weight = torch.nn.Parameter(kernel_q)
        #print(len((model.decoder.layers[3].fc1.weight).unique()))
    print("#########################################")
    """

    # Initialize generator
    gen_timer = StopwatchMeter()
    generator = task.build_generator(models, args)

    # Handle tokenization and BPE
    tokenizer = encoders.build_tokenizer(args)
    bpe = encoders.build_bpe(args)

    def decode_fn(x):
        if bpe is not None:
            x = bpe.decode(x)
        if tokenizer is not None:
            x = tokenizer.decode(x)
        return x
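    # decode_fn undoes BPE first and then reverses tokenization; it is applied
    # uniformly to the source, reference and hypothesis strings below.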

    # Generate and compute BLEU score
    if args.sacrebleu:
        scorer = bleu.SacrebleuScorer()
    else:
        scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk())
    num_sentences = 0
    has_target = True
    wps_meter = TimeMeter()
    for sample in progress:
        sample = utils.move_to_cuda(sample) if use_cuda else sample
        if 'net_input' not in sample:
            continue

        prefix_tokens = None
        if args.prefix_size > 0:
            prefix_tokens = sample['target'][:, :args.prefix_size]

        gen_timer.start()
        hypos = task.inference_step(generator, models, sample, prefix_tokens)
        num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos)
        gen_timer.stop(num_generated_tokens)

        for i, sample_id in enumerate(sample['id'].tolist()):
            has_target = sample['target'] is not None

            # Remove padding
            src_tokens = utils.strip_pad(
                sample['net_input']['src_tokens'][i, :], tgt_dict.pad())
            target_tokens = None
            if has_target:
                target_tokens = utils.strip_pad(sample['target'][i, :],
                                                tgt_dict.pad()).int().cpu()

            # Either retrieve the original sentences or regenerate them from tokens.
            if align_dict is not None:
                src_str = task.dataset(
                    args.gen_subset).src.get_original_text(sample_id)
                target_str = task.dataset(
                    args.gen_subset).tgt.get_original_text(sample_id)
            else:
                if src_dict is not None:
                    src_str = src_dict.string(src_tokens, args.remove_bpe)
                else:
                    src_str = ""
                if has_target:
                    target_str = tgt_dict.string(target_tokens,
                                                 args.remove_bpe,
                                                 escape_unk=True,
                                                 extra_symbols_to_ignore={
                                                     generator.eos,
                                                 })

            src_str = decode_fn(src_str)
            if has_target:
                target_str = decode_fn(target_str)

            if not args.quiet:
                if src_dict is not None:
                    print('S-{}\t{}'.format(sample_id, src_str),
                          file=output_file)
                if has_target:
                    print('T-{}\t{}'.format(sample_id, target_str),
                          file=output_file)

            # Process top predictions
            for j, hypo in enumerate(hypos[i][:args.nbest]):
                hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                    hypo_tokens=hypo['tokens'].int().cpu(),
                    src_str=src_str,
                    alignment=hypo['alignment'],
                    align_dict=align_dict,
                    tgt_dict=tgt_dict,
                    remove_bpe=args.remove_bpe,
                    extra_symbols_to_ignore={
                        generator.eos,
                    })
                detok_hypo_str = decode_fn(hypo_str)
                if not args.quiet:
                    score = hypo['score'] / math.log(2)  # convert to base 2
                    # original hypothesis (after tokenization and BPE)
                    print('H-{}\t{}\t{}'.format(sample_id, score, hypo_str),
                          file=output_file)
                    # detokenized hypothesis
                    print('D-{}\t{}\t{}'.format(sample_id, score,
                                                detok_hypo_str),
                          file=output_file)
                    print(
                        'P-{}\t{}'.format(
                            sample_id,
                            ' '.join(
                                map(
                                    lambda x: '{:.4f}'.format(x),
                                    # convert from base e to base 2
                                    hypo['positional_scores'].div_(math.log(2)
                                                                   ).tolist(),
                                ))),
                        file=output_file)

                    if args.print_alignment:
                        print('A-{}\t{}'.format(
                            sample_id, ' '.join([
                                '{}-{}'.format(src_idx, tgt_idx)
                                for src_idx, tgt_idx in alignment
                            ])),
                              file=output_file)

                    if args.print_step:
                        print('I-{}\t{}'.format(sample_id, hypo['steps']),
                              file=output_file)

                    if getattr(args, 'retain_iter_history', False):
                        for step, h in enumerate(hypo['history']):
                            _, h_str, _ = utils.post_process_prediction(
                                hypo_tokens=h['tokens'].int().cpu(),
                                src_str=src_str,
                                alignment=None,
                                align_dict=None,
                                tgt_dict=tgt_dict,
                                remove_bpe=None,
                            )
                            print('E-{}_{}\t{}'.format(sample_id, step, h_str),
                                  file=output_file)

                # Score only the top hypothesis
                if has_target and j == 0:
                    if align_dict is not None or args.remove_bpe is not None:
                        # Convert back to tokens for evaluation with unk replacement and/or without BPE
                        target_tokens = tgt_dict.encode_line(
                            target_str, add_if_not_exist=True)
                        hypo_tokens = tgt_dict.encode_line(
                            detok_hypo_str, add_if_not_exist=True)
                    if hasattr(scorer, 'add_string'):
                        scorer.add_string(target_str, detok_hypo_str)
                    else:
                        scorer.add(target_tokens, hypo_tokens)

        wps_meter.update(num_generated_tokens)
        progress.log({'wps': round(wps_meter.avg)})
        num_sentences += sample['nsentences']

    logger.info('NOTE: hypothesis and token scores are output in base 2')
    logger.info(
        'Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'
        .format(num_sentences, gen_timer.n, gen_timer.sum,
                num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if has_target:
        if args.bpe and not args.sacrebleu:
            if args.remove_bpe:
                logger.warning(
                    "BLEU score is being computed by splitting detokenized string on spaces, this is probably not what you want. Use --sacrebleu for standard 13a BLEU tokenization"
                )
            else:
                logger.warning(
                    "If you are using BPE on the target side, the BLEU score is computed on BPE tokens, not on proper words.  Use --sacrebleu for standard 13a BLEU tokenization"
                )
        logger.info('Generate {} with beam={}: {}'.format(
            args.gen_subset, args.beam, scorer.result_string()))
        # append the BLEU result to a persistent log file
        with open("infer_BLEU.txt", "a") as myfile:
            myfile.write(scorer.result_string())
            myfile.write("\n")

    return scorer
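
A note on the base-2 conversion used for the H-/P- lines above: since log2(p) = ln(p) / ln(2), dividing a natural-log score by math.log(2) re-expresses it in bits; for example, a score of -0.693 nats becomes roughly -1.0 bits.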
Code Example #15
def _main(args, output_file):
    logging.basicConfig(
        format='%(asctime)s | %(levelname)s | %(name)s | %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        level=logging.INFO,
        stream=output_file,
    )
    logger = logging.getLogger('fairseq_cli.generate')

    utils.import_user_module(args)

    if args.max_tokens is None and args.max_sentences is None:
        args.max_tokens = 12000
    logger.info(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args.gen_subset)

    # Set dictionaries
    try:
        src_dict = getattr(task, 'source_dictionary', None)
    except NotImplementedError:
        src_dict = None
    tgt_dict = task.target_dictionary

    # Load ensemble
    logger.info('loading model(s) from {}'.format(args.path))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        utils.split_paths(args.path),
        arg_overrides=eval(args.model_overrides),
        task=task,
    )

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[model.max_positions() for model in models]),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=args.required_batch_size_multiple,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)
    progress = progress_bar.progress_bar(
        itr,
        log_format=args.log_format,
        log_interval=args.log_interval,
        default_log_format=('tqdm' if not args.no_progress_bar else 'none'),
    )

    # Initialize generator
    gen_timer = StopwatchMeter()
    generator = task.build_generator(args)

    # Generate and compute BLEU score
    if args.sacrebleu:
        scorer = bleu.SacrebleuScorer()
    else:
        scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk())
    num_sentences = 0
    has_target = True
    wps_meter = TimeMeter()
    for sample in progress:
        sample = utils.move_to_cuda(sample) if use_cuda else sample
        if 'net_input' not in sample:
            continue

        prefix_tokens = None
        if args.prefix_size > 0:
            prefix_tokens = sample['target'][:, :args.prefix_size]

        gen_timer.start()
        hypos = task.inference_step(generator, models, sample, prefix_tokens)
        num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos)
        gen_timer.stop(num_generated_tokens)

        for i, sample_id in enumerate(sample['id'].tolist()):
            has_target = sample['target'] is not None

            # Remove padding
            src_tokens = utils.strip_pad(
                sample['net_input']['src_tokens'][i, :], tgt_dict.pad())
            target_tokens = None
            if has_target:
                target_tokens = utils.strip_pad(sample['target'][i, :],
                                                tgt_dict.pad()).int().cpu()

            # Either retrieve the original sentences or regenerate them from tokens.
            if align_dict is not None:
                src_str = task.dataset(
                    args.gen_subset).src.get_original_text(sample_id)
                target_str = task.dataset(
                    args.gen_subset).tgt.get_original_text(sample_id)
            else:
                if src_dict is not None:
                    src_str = src_dict.string(src_tokens, args.remove_bpe)
                else:
                    src_str = ""
                if has_target:
                    target_str = tgt_dict.string(target_tokens,
                                                 args.remove_bpe,
                                                 escape_unk=True)

            if not args.quiet:
                if src_dict is not None:
                    print('S-{}\t{}'.format(sample_id, src_str),
                          file=output_file)
                if has_target:
                    print('T-{}\t{}'.format(sample_id, target_str),
                          file=output_file)

            # Process top predictions
            for j, hypo in enumerate(hypos[i][:args.nbest]):
                hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                    hypo_tokens=hypo['tokens'].int().cpu(),
                    src_str=src_str,
                    alignment=hypo['alignment'],
                    align_dict=align_dict,
                    tgt_dict=tgt_dict,
                    remove_bpe=args.remove_bpe,
                )

                if not args.quiet:
                    score = hypo['score'] / math.log(2)  # convert to base 2
                    print('H-{}\t{}\t{}'.format(sample_id, score, hypo_str),
                          file=output_file)
                    print(
                        'P-{}\t{}'.format(
                            sample_id,
                            ' '.join(
                                map(
                                    lambda x: '{:.4f}'.format(x),
                                    # convert from base e to base 2
                                    hypo['positional_scores'].div_(math.log(2)
                                                                   ).tolist(),
                                ))),
                        file=output_file)

                    if args.print_alignment:
                        print('A-{}\t{}'.format(
                            sample_id, ' '.join([
                                '{}-{}'.format(src_idx, tgt_idx)
                                for src_idx, tgt_idx in alignment
                            ])),
                              file=output_file)

                    if args.print_step:
                        print('I-{}\t{}'.format(sample_id, hypo['steps']),
                              file=output_file)
                        print('O-{}\t{}'.format(sample_id, hypo['num_ops']),
                              file=output_file)

                    if getattr(args, 'retain_iter_history', False):
                        for step, h in enumerate(hypo['history']):
                            _, h_str, _ = utils.post_process_prediction(
                                hypo_tokens=h['tokens'].int().cpu(),
                                src_str=src_str,
                                alignment=None,
                                align_dict=None,
                                tgt_dict=tgt_dict,
                                remove_bpe=None,
                            )
                            print('E-{}_{}\t{}'.format(sample_id, step, h_str),
                                  file=output_file)

                # Score only the top hypothesis
                if has_target and j == 0:
                    if align_dict is not None or args.remove_bpe is not None:
                        # Convert back to tokens for evaluation with unk replacement and/or without BPE
                        target_tokens = tgt_dict.encode_line(
                            target_str, add_if_not_exist=True)
                    if hasattr(scorer, 'add_string'):
                        scorer.add_string(target_str, hypo_str)
                    else:
                        scorer.add(target_tokens, hypo_tokens)

        wps_meter.update(num_generated_tokens)
        progress.log({'wps': round(wps_meter.avg)})
        num_sentences += sample['nsentences']

    logger.info('NOTE: hypothesis and token scores are output in base 2')
    logger.info(
        'Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'
        .format(num_sentences, gen_timer.n, gen_timer.sum,
                num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if has_target:
        logger.info('Generate {} with beam={}: {}'.format(
            args.gen_subset, args.beam, scorer.result_string()))

    return scorer
Code Example #16
def _generate_score(models, args, task, dataset_split, optimize=True):
    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load ensemble
    if not args.quiet:
        print("| loading model(s) from {}".format(", ".join(args.path)))

    # Optimize ensemble for generation
    if optimize:
        for model in models:
            model.make_generation_fast_(
                beamable_mm_beam_size=None if args.no_beamable_mm else args.beam)

    translator = build_sequence_generator(args, task, models)
    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Keep track of translations, initialized with empty translations
    # and zero probability scores
    translated_sentences = [""] * len(task.dataset(dataset_split))
    translated_scores = [0.0] * len(task.dataset(dataset_split))

    # Generate and compute BLEU score
    dst_dict = task.target_dictionary
    scorer = bleu.Scorer(dst_dict.pad(), dst_dict.eos(), dst_dict.unk())
    itr = get_eval_itr(args, models, task, dataset_split)

    num_sentences = 0
    translation_samples = []
    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        # Keep more detailed timing when invoked from benchmark
        if "keep_detailed_timing" in args:
            gen_timer = pytorch_translate_utils.BucketStopwatchMeter(
                args.increment, args.max_length, args.samples_per_length)
        else:
            gen_timer = StopwatchMeter()
        translations = translator.generate_batched_itr(
            t,
            maxlen_a=args.max_len_a,
            maxlen_b=args.max_len_b,
            cuda=use_cuda,
            timer=gen_timer,
            prefix_size=1
            if pytorch_translate_data.is_multilingual(args) else 0,
        )
        if pytorch_translate_data.is_multilingual(args):
            first_best_translations = _iter_first_best_multilingual
        else:
            first_best_translations = _iter_first_best_bilingual
        for trans_info in first_best_translations(args, task, dataset_split,
                                                  translations, align_dict):
            scorer.add(trans_info.target_tokens, trans_info.hypo_tokens)
            translated_sentences[trans_info.sample_id] = trans_info.hypo_str
            translated_scores[trans_info.sample_id] = trans_info.hypo_score
            translation_samples.append(
                collections.OrderedDict({
                    "sample_id": trans_info.sample_id,
                    "src_str": trans_info.src_str,
                    "target_str": trans_info.target_str,
                    "hypo_str": trans_info.hypo_str,
                }))
            wps_meter.update(trans_info.src_tokens.size(0))
            t.log({"wps": round(wps_meter.avg)})
            num_sentences += 1

    # If applicable, save the translations to the output file,
    # e.g. for external evaluation
    if getattr(args, "translation_output_file", False):
        with open(args.translation_output_file, "w") as out_file:
            for hypo_str in translated_sentences:
                print(hypo_str, file=out_file)

    if getattr(args, "translation_probs_file", False):
        with open(args.translation_probs_file, "w") as out_file:
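            # hypo scores are log-probabilities; np.exp converts them to probabilities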
            for hypo_score in translated_scores:
                print(np.exp(hypo_score), file=out_file)

    return scorer, num_sentences, gen_timer, translation_samples
Code Example #17
def generate_main(args, model):
    models = [model]
    result_writer = open(args.results_path, "w", encoding="utf-8")

    #assert args.path is not None, '--path required for generation!'
    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert args.replace_unk is None or args.raw_text, \
        '--replace-unk requires a raw text dataset (--raw-text)'

    utils.import_user_module(args)

    if args.max_tokens is None and args.max_sentences is None:
        args.max_tokens = 12000
    #print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args.gen_subset)

    # Set dictionaries
    try:
        src_dict = getattr(task, 'source_dictionary', None)
    except NotImplementedError:
        src_dict = None
    tgt_dict = task.target_dictionary

    # Load ensemble / optimize for generation: both are skipped here because
    # `model` is passed in by the caller already prepared for generation.

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[model.max_positions() for model in models]),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=args.required_batch_size_multiple,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)

    # Initialize generator
    gen_timer = StopwatchMeter()
    generator = task.build_generator(args)

    # Generate and compute BLEU score
    if args.sacrebleu:
        scorer = bleu.SacrebleuScorer()
    else:
        scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk())
    num_sentences = 0
    has_target = True
    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        for sample in t:
            sample = utils.move_to_cuda(sample) if use_cuda else sample
            if 'net_input' not in sample:
                continue

            prefix_tokens = None
            if args.prefix_size > 0:
                prefix_tokens = sample['target'][:, :args.prefix_size]

            gen_timer.start()
            hypos = task.inference_step(generator, models, sample,
                                        prefix_tokens)
            num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos)
            gen_timer.stop(num_generated_tokens)

            for i, sample_id in enumerate(sample['id'].tolist()):
                has_target = sample['target'] is not None

                # Remove padding
                src_tokens = utils.strip_pad(
                    sample['net_input']['src_tokens'][i, :], tgt_dict.pad())
                target_tokens = None
                if has_target:
                    target_tokens = utils.strip_pad(
                        sample['target'][i, :], tgt_dict.pad()).int().cpu()

                # Either retrieve the original sentences or regenerate them from tokens.
                if align_dict is not None:
                    src_str = task.dataset(
                        args.gen_subset).src.get_original_text(sample_id)
                    target_str = task.dataset(
                        args.gen_subset).tgt.get_original_text(sample_id)
                else:
                    if src_dict is not None:
                        src_str = src_dict.string(src_tokens, args.remove_bpe)
                    else:
                        src_str = ""
                    if has_target:
                        target_str = tgt_dict.string(target_tokens,
                                                     args.remove_bpe,
                                                     escape_unk=True)

                if not args.quiet:
                    if src_dict is not None:
                        print('S-{}\t{}'.format(sample_id, src_str),
                              file=result_writer)
                    if has_target:
                        print('T-{}\t{}'.format(sample_id, target_str),
                              file=result_writer)

                # Process top predictions
                for j, hypo in enumerate(hypos[i][:args.nbest]):
                    hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                        hypo_tokens=hypo['tokens'].int().cpu(),
                        src_str=src_str,
                        alignment=hypo['alignment'].int().cpu()
                        if hypo['alignment'] is not None else None,
                        align_dict=align_dict,
                        tgt_dict=tgt_dict,
                        remove_bpe=args.remove_bpe,
                    )

                    if not args.quiet:
                        print('H-{}\t{}\t{}'.format(sample_id, hypo['score'],
                                                    hypo_str),
                              file=result_writer)
                        print('P-{}\t{}'.format(
                            sample_id, ' '.join(
                                map(
                                    lambda x: '{:.4f}'.format(x),
                                    hypo['positional_scores'].tolist(),
                                ))),
                              file=result_writer)

                        if args.print_alignment:
                            print('A-{}\t{}'.format(
                                sample_id, ' '.join(
                                    map(lambda x: str(utils.item(x)),
                                        alignment))),
                                  file=result_writer)

                    # Score only the top hypothesis
                    if has_target and j == 0:
                        if align_dict is not None or args.remove_bpe is not None:
                            # Convert back to tokens for evaluation with unk replacement and/or without BPE
                            target_tokens = tgt_dict.encode_line(
                                target_str, add_if_not_exist=True)
                        if hasattr(scorer, 'add_string'):
                            scorer.add_string(target_str, hypo_str)
                        else:
                            scorer.add(target_tokens, hypo_tokens)

            wps_meter.update(num_generated_tokens)
            t.log({'wps': round(wps_meter.avg)})
            num_sentences += sample['nsentences']

    print(
        '| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'
        .format(num_sentences, gen_timer.n, gen_timer.sum,
                num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if has_target:
        print('| Generate {} with beam={}: {}'.format(args.gen_subset,
                                                      args.beam,
                                                      scorer.result_string()))
    return scorer
Code Example #18
def main(args):
    logging.basicConfig(
        format='%(asctime)s | %(levelname)s | %(name)s | %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        level=os.environ.get('LOGLEVEL', 'INFO').upper(),
        stream=sys.stdout,
    )
    logger = logging.getLogger('fairseq_cli.generate')

    assert args.path is not None, '--path required for generation!'
    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert args.replace_unk is None or args.raw_text, \
        '--replace-unk requires a raw text dataset (--raw-text)'

    utils.import_user_module(args)

    if args.max_tokens is None and args.max_sentences is None:
        args.max_tokens = 12000
    #print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args.gen_subset)

    # Set dictionaries
    try:
        src_dict = getattr(task, 'source_dictionary', None)
    except NotImplementedError:
        src_dict = None
    tgt_dict = task.target_dictionary

    # Load ensemble
    logger.info('loading model(s) from {}'.format(args.path))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        args.path.split(':'),
        arg_overrides=eval(args.model_overrides),
        task=task,
    )

    mose_de = MosesDetokenizer(lang='en')  # detokenizes English, despite the name
    if args.bert_model_path:
        bert_tokenizer = BertTokenizer.from_pretrained(args.bert_model_path)
        bert_model = BertModel.from_pretrained(args.bert_model_path)
        bert_model.cuda()
        bert_model.eval()
    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[model.max_positions() for model in models]),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=args.required_batch_size_multiple,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)
    batch_len = len(task.dataset(args.gen_subset))

    # Initialize generator
    gen_timer = StopwatchMeter()
    generator = task.build_generator(args)

    # Generate and compute BLEU score
    if args.sacrebleu:
        scorer = bleu.SacrebleuScorer()
    else:
        scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk())
    num_sentences = 0
    has_target = True

    total_pearson = []
    random_list = []
    sents_num = 0
    if args.gen_subset == 'train':
        # Scoring the full training set is expensive, so sample 1000
        # random sentence ids and only evaluate those.
        random_list = list(range(batch_len))
        random.shuffle(random_list)
        random_list = random_list[:1000]

    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        for sample in t:
            if random_list:
                # Keep only the sentences whose ids were sampled above.
                keep = torch.tensor(
                    [int(i) in random_list for i in sample['id']])
                selected = torch.nonzero(keep).squeeze(-1)
                if len(selected) == 0:
                    continue
                for item in sample.keys():
                    if item == 'nsentences' or item == 'ntokens':
                        continue
                    elif item == 'net_input':
                        for input in sample[item].keys():
                            sample[item][input] = sample[item][
                                input].index_select(0, selected)
                    else:
                        sample[item] = sample[item].index_select(0, selected)
                sample['nsentences'] = len(selected)
                # Recount target tokens, excluding padding.
                sample['ntokens'] = torch.LongTensor([
                    s.ne(tgt_dict.pad()).long().sum()
                    for s in sample['target']
                ]).sum().item()

            sample = utils.move_to_cuda(sample) if use_cuda else sample
            if 'net_input' not in sample:
                continue

            prefix_tokens = None
            if args.prefix_size > 0:
                prefix_tokens = sample['target'][:, :args.prefix_size]

            gen_timer.start()
            hypos = task.inference_step(generator, models, sample,
                                        prefix_tokens)
            num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos)
            gen_timer.stop(num_generated_tokens)

            for i, sample_id in enumerate(sample['id'].tolist()):
                sents_num += 1
                if random_list and sample_id not in random_list:
                    continue
                has_target = sample['target'] is not None

                # Remove padding
                src_tokens = utils.strip_pad(
                    sample['net_input']['src_tokens'][i, :], tgt_dict.pad())
                target_tokens = None
                if has_target:
                    target_tokens = utils.strip_pad(
                        sample['target'][i, :], tgt_dict.pad()).int().cpu()

                # Either retrieve the original sentences or regenerate them from tokens.
                if align_dict is not None:
                    src_str = task.dataset(
                        args.gen_subset).src.get_original_text(sample_id)
                    target_str = task.dataset(
                        args.gen_subset).tgt.get_original_text(sample_id)
                else:
                    if src_dict is not None:
                        src_str = src_dict.string(src_tokens, args.remove_bpe)
                    else:
                        src_str = ""
                    if has_target:
                        target_str = tgt_dict.string(target_tokens,
                                                     args.remove_bpe,
                                                     escape_unk=True)

                if not args.quiet:
                    print("---------------{}--------------".format(sents_num))
                    if src_dict is not None:
                        print('S-{}\t{}'.format(sample_id, src_str),
                              file=sys.stdout)
                    if has_target:
                        print('T-{}\t{}'.format(sample_id, target_str),
                              file=sys.stdout)

                # Process top predictions
                # Per-candidate statistics over the n-best list.
                probs = []
                bleu_score = []
                cands = []
                detoken_cands = []
                temp_cand_tokens = []

                for j, hypo in enumerate(hypos[i][:args.nbest]):
                    hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                        hypo_tokens=hypo['tokens'].int().cpu(),
                        src_str=src_str,
                        alignment=hypo['alignment'],
                        align_dict=align_dict,
                        tgt_dict=tgt_dict,
                        remove_bpe=args.remove_bpe,
                    )

                    # Convert the model score to base 2; compute it
                    # unconditionally, since it is used below even when
                    # --quiet suppresses printing.
                    score = hypo['score'] / math.log(2)
                    probs.append(score)
                    # Sentence-level BLEU of this candidate vs. the reference.
                    single_score = sacrebleu.corpus_bleu(
                        [hypo_str], [[target_str]],
                        use_effective_order=True,
                        tokenize="none")
                    bleu_score.append(single_score.score)
                    cands.append(hypo_str)
                    temp_cand_tokens.append(hypo['tokens'].cpu())

                    # Detokenize with Moses so BERT sees natural text.
                    hypo_splittokens = hypo_str.split(' ')
                    detoken_hypo = moses_de.detokenize(hypo_splittokens)
                    detoken_cands.append(detoken_hypo)

                # Decoder features for each candidate, given the source.
                current_src_length = sample['net_input']['src_lengths'][i]
                current_src_tokens = sample['net_input']['src_tokens'][i, :]
                decoded_out = get_decoded_out(current_src_tokens,
                                              current_src_length,
                                              temp_cand_tokens, models)
                # BERT sentence embeddings of the detokenized candidates,
                # mean-pooled over the token dimension.
                encoded_input = bert_tokenizer(detoken_cands,
                                               return_tensors='pt',
                                               padding=True)
                if use_cuda:
                    for key in encoded_input:
                        encoded_input[key] = encoded_input[key].cuda()
                bert_output = bert_model(**encoded_input)[0]
                encoded_out = bert_output.mean(dim=1)

                # Feed-forward reranker over the concatenated features;
                # softmax over the n-best list scores each candidate.
                net_out = models[0].decoder.through_ffnet(
                    torch.cat((encoded_out, decoded_out), 1))
                sent_logits = torch.nn.functional.softmax(
                    net_out.view(-1, args.nbest), dim=1)
                net_pos = torch.argmax(sent_logits).item()

                # Pearson correlation between model scores and sentence
                # BLEU over the n-best list.
                np_prob = np.array(probs)
                np_bleu = np.array(bleu_score)
                pearson = np.corrcoef(np_prob, np_bleu)[0][1]
                if not np.isnan(pearson):
                    total_pearson.append(pearson)

                bleu_pos = bleu_score.index(max(bleu_score))
                print("-----bleu choice: {} bleu:{:.3f}  pos: {}".format(
                    cands[bleu_pos], bleu_score[bleu_pos], bleu_pos + 1),
                      file=sys.stdout)
                pos = probs.index(max(probs))
                print("-----prob choice: {} bleu:{:.3f} pos: {}".format(
                    cands[pos], bleu_score[pos], pos + 1),
                      file=sys.stdout)
                print("-----net choice: {} bleu:{:.3f} pos: {}".format(
                    cands[net_pos], bleu_score[net_pos], net_pos + 1),
                      file=sys.stdout)
                if args.usebleu:
                    final_hypo = cands[bleu_pos]
                elif args.usebert:
                    final_hypo = cands[net_pos]
                else:
                    final_hypo = cands[pos]
                scorer.add_string(target_str, final_hypo)

                print('H choice use bleu: {} usebert: {}'.format(
                    args.usebleu, args.usebert))
                # Report running BLEU every 800 sentences.
                if has_target and sents_num % 800 == 0:
                    print('Generate {} with beam={}: {}\t{}'.format(
                        args.gen_subset, args.beam,
                        scorer.result_string(), sents_num),
                          file=sys.stdout)
            wps_meter.update(num_generated_tokens)
            t.log({'wps': round(wps_meter.avg)})
            num_sentences += sample['nsentences']

    logger.info(
        'Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'
        .format(num_sentences, gen_timer.n, gen_timer.sum,
                num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if has_target:
        # Guard against division by zero if no finite correlations were seen.
        avg_pearson = (sum(total_pearson) / len(total_pearson)
                       if total_pearson else float('nan'))
        print(
            'Generate {} with beam={}: {}\n --- prob&bleu pearson: {:.4f} ---'.
            format(args.gen_subset, args.beam, scorer.result_string(),
                   avg_pearson),
            file=sys.stdout)

    return scorer
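The example above reports three selections over the n-best list: the oracle (best sentence BLEU), the model's most probable candidate, and the feed-forward net's pick, plus a Pearson correlation between model scores and sentence BLEU. A self-contained sketch of that correlation diagnostic, assuming only numpy and sacrebleu; the candidates and log-probabilities below are made up:

# Sketch: probability-vs-BLEU correlation over a toy n-best list.
import numpy as np
import sacrebleu

target = 'the talks were held in berlin'
nbest = [
    ('the talks were held in berlin', -0.2),   # (candidate, log-prob)
    ('talks were held in berlin', -0.4),
    ('the talks took place in berlin', -0.9),
]

probs = [score for _, score in nbest]
bleus = [sacrebleu.corpus_bleu([cand], [[target]],
                               use_effective_order=True,
                               tokenize='none').score
         for cand, _ in nbest]

# If this correlation is high, picking the most probable candidate is
# close to picking the oracle-BLEU one, and reranking gains little.
pearson = np.corrcoef(np.array(probs), np.array(bleus))[0][1]
print('prob&bleu pearson: {:.4f}'.format(pearson))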
Code Example #19
def validate_translation(args, trainer, task, epoch_itr, generator):
    src_dict = task.source_dictionary
    tgt_dict = task.target_dictionary
    models = [trainer.get_model()]
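    # Multilingual tasks expose eval_lang_pairs and are scored per language
    # pair below; single-pair tasks fall through to the else branch, which
    # runs the same decode-and-score loop with a single scorer keyed by 0.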
    if hasattr(task, 'eval_lang_pairs'):
        bleu_dict = {key: None for key in task.eval_lang_pairs}

        # Generate and compute BLEU score
        if args.sacrebleu:
            scorer_dict = {
                key: bleu.SacrebleuScorer()
                for key in task.eval_lang_pairs
            }
        else:
            scorer_dict = {
                key: bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(),
                                 tgt_dict.unk())
                for key in task.eval_lang_pairs
            }

        itr = task.get_batch_iterator(
            dataset=task.dataset('valid'),
            max_tokens=args.max_tokens_valid,
            max_sentences=args.max_sentences_valid,
            max_positions=utils.resolve_max_positions(
                task.max_positions(),
                trainer.get_model().max_positions(),
            ),
            ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
            required_batch_size_multiple=args.required_batch_size_multiple,
            seed=args.seed,
            num_shards=args.distributed_world_size,
            shard_id=args.distributed_rank,
            num_workers=args.num_workers,
            noskip=True,
        )[0].next_epoch_itr(shuffle=False)
        progress = progress_bar.build_progress_bar(args,
                                                   itr,
                                                   epoch_itr.epoch,
                                                   prefix='translate subset',
                                                   no_progress_bar='simple')

        num_sentences = 0
        has_target = True
        #with progress_bar.build_progress_bar(args, itr) as t:
        for samples in progress:
            if torch.cuda.is_available() and not args.cpu:
                samples = utils.move_to_cuda(samples)
            #if 'net_input' not in samples:
            #    continue

            prefix_tokens = None
            for key, sample in samples.items():
                hypos = task.inference_step(generator, models, sample,
                                            prefix_tokens)
                num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos)

                for i, sample_id in enumerate(sample['id'].tolist()):
                    has_target = sample['target'] is not None

                    target_tokens = None
                    if has_target:
                        target_tokens = utils.strip_pad(
                            sample['target'][i, :],
                            tgt_dict.pad()).int().cpu()
                    # Remove padding
                    if args.sde:
                        src_tokens = target_tokens
                    else:
                        src_tokens = utils.strip_pad(
                            sample['net_input']['src_tokens'][i, :],
                            tgt_dict.pad())

                    # Either retrieve the original sentences or regenerate them from tokens.
                    #if src_dict is not None:
                    #    src_str = src_dict.string(src_tokens, args.remove_bpe)
                    #else:
                    #    src_str = ""
                    if has_target:
                        target_str = tgt_dict.string(target_tokens,
                                                     args.remove_bpe,
                                                     escape_unk=True)

                    #if not args.quiet:
                    #    if src_dict is not None:
                    #        print('S-{}\t{}'.format(sample_id, src_str))
                    #    if has_target:
                    #        print('T-{}\t{}'.format(sample_id, target_str))

                    # Process top predictions
                    for j, hypo in enumerate(hypos[i][:args.nbest]):
                        hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                            hypo_tokens=hypo['tokens'].int().cpu(),
                            src_str="",
                            alignment=hypo['alignment'].int().cpu()
                            if hypo['alignment'] is not None else None,
                            align_dict=None,
                            tgt_dict=tgt_dict,
                            remove_bpe=args.remove_bpe,
                        )

                        #if not args.quiet:
                        #    print('H-{}\t{}\t{}'.format(sample_id, hypo['score'], hypo_str))
                        #    print('P-{}\t{}'.format(
                        #        sample_id,
                        #        ' '.join(map(
                        #            lambda x: '{:.4f}'.format(x),
                        #            hypo['positional_scores'].tolist(),
                        #        ))
                        #    ))

                        #    if args.print_alignment:
                        #        print('A-{}\t{}'.format(
                        #            sample_id,
                        #            ' '.join(map(lambda x: str(utils.item(x)), alignment))
                        #        ))
                        #print(has_target, j, hypo_str)
                        # Score only the top hypothesis
                        if has_target and j == 0:
                            if args.remove_bpe is not None:
                                # Convert back to tokens for evaluation without BPE
                                target_tokens = tgt_dict.encode_line(
                                    target_str, add_if_not_exist=True)
                            if hasattr(scorer_dict[key], 'add_string'):
                                scorer_dict[key].add_string(
                                    target_str, hypo_str)
                            else:
                                scorer_dict[key].add(target_tokens,
                                                     hypo_tokens)

                num_sentences += sample['nsentences']
        print("|valid tranlsated {} sentences".format(num_sentences))
        for key, scorer in scorer_dict.items():
            bleu_dict[key] = scorer.score()
    else:
        bleu_dict = {0: None}

        # Generate and compute BLEU score
        if args.sacrebleu:
            scorer_dict = {0: bleu.SacrebleuScorer()}
        else:
            scorer_dict = {
                0: bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk())
            }

        itr = task.get_batch_iterator(
            dataset=task.dataset('valid'),
            max_tokens=args.max_tokens_valid,
            max_sentences=args.max_sentences_valid,
            max_positions=utils.resolve_max_positions(
                task.max_positions(),
                trainer.get_model().max_positions(),
            ),
            ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
            required_batch_size_multiple=args.required_batch_size_multiple,
            seed=args.seed,
            num_shards=args.distributed_world_size,
            shard_id=args.distributed_rank,
            num_workers=args.num_workers,
            noskip=True,
        )[0].next_epoch_itr(shuffle=False)
        progress = progress_bar.build_progress_bar(args,
                                                   itr,
                                                   epoch_itr.epoch,
                                                   prefix='translate subset',
                                                   no_progress_bar='simple')

        num_sentences = 0
        has_target = True
        #with progress_bar.build_progress_bar(args, itr) as t:
        for samples in progress:
            if torch.cuda.is_available() and not args.cpu:
                samples = utils.move_to_cuda(samples)
            #if 'net_input' not in samples:
            #    continue
            prefix_tokens = None
            sample = samples
            hypos = task.inference_step(generator, models, sample,
                                        prefix_tokens)
            num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos)

            for i, sample_id in enumerate(sample['id'].tolist()):
                has_target = sample['target'] is not None

                target_tokens = None
                if has_target:
                    target_tokens = utils.strip_pad(
                        sample['target'][i, :], tgt_dict.pad()).int().cpu()
                # Remove padding
                if args.sde:
                    src_tokens = target_tokens
                else:
                    src_tokens = utils.strip_pad(
                        sample['net_input']['src_tokens'][i, :],
                        tgt_dict.pad())

                # Either retrieve the original sentences or regenerate them from tokens.
                #if src_dict is not None:
                #    src_str = src_dict.string(src_tokens, args.remove_bpe)
                #else:
                #    src_str = ""
                if has_target:
                    target_str = tgt_dict.string(target_tokens,
                                                 args.remove_bpe,
                                                 escape_unk=True)

                #if not args.quiet:
                #    if src_dict is not None:
                #        print('S-{}\t{}'.format(sample_id, src_str))
                #    if has_target:
                #        print('T-{}\t{}'.format(sample_id, target_str))

                # Process top predictions
                for j, hypo in enumerate(hypos[i][:args.nbest]):
                    hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                        hypo_tokens=hypo['tokens'].int().cpu(),
                        src_str="",
                        alignment=hypo['alignment'].int().cpu()
                        if hypo['alignment'] is not None else None,
                        align_dict=None,
                        tgt_dict=tgt_dict,
                        remove_bpe=args.remove_bpe,
                    )

                    #if not args.quiet:
                    #    print('H-{}\t{}\t{}'.format(sample_id, hypo['score'], hypo_str))
                    #    print('P-{}\t{}'.format(
                    #        sample_id,
                    #        ' '.join(map(
                    #            lambda x: '{:.4f}'.format(x),
                    #            hypo['positional_scores'].tolist(),
                    #        ))
                    #    ))

                    #    if args.print_alignment:
                    #        print('A-{}\t{}'.format(
                    #            sample_id,
                    #            ' '.join(map(lambda x: str(utils.item(x)), alignment))
                    #        ))
                    #print(has_target, j, hypo_str)
                    # Score only the top hypothesis
                    if has_target and j == 0:
                        if args.remove_bpe is not None:
                            # Convert back to tokens for evaluation without BPE
                            target_tokens = tgt_dict.encode_line(
                                target_str, add_if_not_exist=True)
                        if hasattr(scorer_dict[0], 'add_string'):
                            scorer_dict[0].add_string(target_str, hypo_str)
                        else:
                            scorer_dict[0].add(target_tokens, hypo_tokens)

            num_sentences += sample['nsentences']
        print("|valid tranlsated {} sentences".format(num_sentences))
        for key, scorer in scorer_dict.items():
            bleu_dict[key] = scorer.score()

    return bleu_dict
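validate_translation returns a dict of BLEU scores keyed by language pair (or by 0 for single-pair tasks). How a caller reduces that dict to one validation number is not prescribed by the example; a sketch under the assumption that a uniform average is acceptable:

# Sketch: collapse the per-pair BLEU dict into a single metric
# (uniform averaging is an assumed choice, not taken from the example).
def aggregate_bleu(bleu_dict):
    scores = [s for s in bleu_dict.values() if s is not None]
    return sum(scores) / len(scores) if scores else None

print(aggregate_bleu({'de-en': 27.3, 'fr-en': 31.1}))  # 29.2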
Code Example #20
def main(args):
    assert args.path is not None, '--path required for generation!'
    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert args.replace_unk is None or args.dataset_impl == 'raw', \
        '--replace-unk requires a raw text dataset (--dataset-impl=raw)'

    utils.import_user_module(args)

    if args.max_tokens is None and args.max_sentences is None:
        args.max_tokens = 12000
    print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args.gen_subset)

    # Set dictionaries
    try:
        src_dict = getattr(task, 'source_dictionary', None)
    except NotImplementedError:
        src_dict = None
    tgt_dict = task.target_dictionary

    # Load ensemble
    print('| loading model(s) from {}'.format(args.path))

    models, _model_args = checkpoint_utils.load_model_ensemble(
        args.path.split(':'),
        arg_overrides=args.model_overrides,
        task=task,
    )

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[model.max_positions() for model in models]),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=args.required_batch_size_multiple,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)

    # Initialize generator
    gen_timer = StopwatchMeter()
    generator = task.build_generator(args)

    # Generate and compute BLEU score
    if args.sacrebleu:
        scorer = bleu.SacrebleuScorer()
    else:
        scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk())
    num_sentences = 0
    has_target = True
    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()

        if args.decoding_path is not None:
            # Buffers indexed by sample id so that outputs can be written
            # in dataset order; 1e6 is an upper bound on the dataset size.
            src_sents = [[] for _ in range(1000000)]
            tgt_sents = [[] for _ in range(1000000)]
            hyp_sents = [[] for _ in range(1000000)]

        for sample in t:
            sample = utils.move_to_cuda(sample) if use_cuda else sample
            if 'net_input' not in sample:
                continue

            prefix_tokens = None
            if args.prefix_size > 0:
                prefix_tokens = sample['target'][:, :args.prefix_size]

            gen_timer.start()
            hypos = task.inference_step(generator, models, sample,
                                        prefix_tokens)
            num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos)
            gen_timer.stop(num_generated_tokens)

            for i, sample_id in enumerate(sample['id'].tolist()):
                has_target = sample['target'] is not None

                # Remove padding
                src_tokens = utils.strip_pad(
                    sample['net_input']['src_tokens'][i, :], tgt_dict.pad())
                target_tokens = None
                if has_target:
                    target_tokens = utils.strip_pad(
                        sample['target'][i, :], tgt_dict.pad()).int().cpu()

                # Either retrieve the original sentences or regenerate them from tokens.
                if align_dict is not None:
                    src_str = task.dataset(
                        args.gen_subset).src.get_original_text(sample_id)
                    target_str = task.dataset(
                        args.gen_subset).tgt.get_original_text(sample_id)
                else:
                    if src_dict is not None:
                        src_str = src_dict.string(src_tokens, args.remove_bpe)
                    else:
                        src_str = ""
                    if has_target:
                        target_str = tgt_dict.string(target_tokens,
                                                     args.remove_bpe,
                                                     escape_unk=True)

                if not args.quiet:
                    if src_dict is not None:
                        print('S-{}\t{}'.format(sample_id, src_str))
                    if has_target:
                        print('T-{}\t{}'.format(sample_id, target_str))

                # Process top predictions
                for j, hypo in enumerate(hypos[i][:args.nbest]):
                    hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                        hypo_tokens=hypo['tokens'].int().cpu(),
                        src_str=src_str,
                        alignment=hypo['alignment'],
                        align_dict=align_dict,
                        tgt_dict=tgt_dict,
                        remove_bpe=args.remove_bpe,
                    )

                    if not args.quiet:
                        print('H-{}\t{}\t{}'.format(sample_id, hypo['score'],
                                                    hypo_str))
                        print('P-{}\t{}'.format(
                            sample_id, ' '.join(
                                map(
                                    lambda x: '{:.4f}'.format(x),
                                    hypo['positional_scores'].tolist(),
                                ))))

                        if args.print_alignment:
                            print('A-{}\t{}'.format(
                                sample_id, ' '.join([
                                    '{}-{}'.format(src_idx, tgt_idx)
                                    for src_idx, tgt_idx in alignment
                                ])))

                        if args.print_step:
                            print('I-{}\t{}'.format(sample_id, hypo['steps']))

                        if getattr(args, 'retain_iter_history', False):
                            print("\n".join([
                                'E-{}_{}\t{}'.format(
                                    sample_id, step,
                                    utils.post_process_prediction(
                                        h['tokens'].int().cpu(), src_str, None,
                                        None, tgt_dict, None)[1])
                                for step, h in enumerate(hypo['history'])
                            ]))

                    if args.decoding_path is not None:
                        src_sents[int(sample_id)].append(src_str)
                        tgt_sents[int(sample_id)].append(target_str)
                        hyp_sents[int(sample_id)].append(hypo_str)

                    # Score only the top hypothesis
                    if has_target and j == 0:
                        if align_dict is not None or args.remove_bpe is not None:
                            # Convert back to tokens for evaluation with unk replacement and/or without BPE
                            target_tokens = tgt_dict.encode_line(
                                target_str, add_if_not_exist=True)
                        if hasattr(scorer, 'add_string'):
                            scorer.add_string(target_str, hypo_str)
                        else:
                            scorer.add(target_tokens, hypo_tokens)

            wps_meter.update(num_generated_tokens)
            t.log({'wps': round(wps_meter.avg)})
            num_sentences += sample['nsentences']

    print(
        '| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'
        .format(num_sentences, gen_timer.n, gen_timer.sum,
                num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if has_target:
        print('| Generate {} with beam={}: {}'.format(args.gen_subset,
                                                      args.beam,
                                                      scorer.result_string()))

    if args.decoding_path is not None:
        # Write sources, references and hypotheses in dataset order.
        for fname, buf in (('source.txt', src_sents),
                           ('target.txt', tgt_sents),
                           ('decoding.txt', hyp_sents)):
            with open(os.path.join(args.decoding_path, fname),
                      'w',
                      encoding='utf-8') as f:
                for sents in buf:
                    for sent in sents:
                        f.write(sent + '\n')

    # All language pairs are assumed to share one reference count.
    if len(list(args.num_ref.values())) == 1:
        num_ref = int(list(args.num_ref.values())[0])
    else:
        raise NotImplementedError

    ref_path = []

    if num_ref == 1:
        ref_path.append(
            os.path.join(args.valid_decoding_path,
                         args.gen_subset + '.tok.' + args.target_lang))
    else:
        for i in range(num_ref):
            ref_path.append(
                os.path.join(
                    args.valid_decoding_path,
                    args.gen_subset + '.tok.' + args.target_lang + str(i)))

    decoding_path = os.path.join(args.decoding_path, 'decoding.txt')

    #with open(decoding_path) as out_file:
    #    out_file.seek(0)
    #    subprocess.call(
    #        'perl %s/multi-bleu.perl %s' % (args.multi_bleu_path, ' '.join(ref_path)),
    #        stdin=out_file, shell=True)

    return scorer
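The example writes decoding.txt and collects ref_path, but leaves the multi-bleu.perl invocation commented out. A sketch of scoring those files with sacrebleu instead; it assumes the files written above hold one sentence per line in parallel order, which is how the writing loop produces them:

# Sketch: multi-reference BLEU over the files written above, replacing
# the commented-out multi-bleu.perl call with sacrebleu.
import sacrebleu

def score_decoding(decoding_path, ref_paths):
    with open(decoding_path, encoding='utf-8') as f:
        hyps = [line.rstrip('\n') for line in f]
    # One reference stream per file; each stream is parallel to hyps.
    refs = []
    for path in ref_paths:
        with open(path, encoding='utf-8') as f:
            refs.append([line.rstrip('\n') for line in f])
    return sacrebleu.corpus_bleu(hyps, refs).score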
Code Example #21
def main(args):
    assert args.path is not None, '--path required for generation!'
    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert args.replace_unk is None or args.raw_text, \
        '--replace-unk requires a raw text dataset (--raw-text)'

    if args.max_tokens is None and args.max_sentences is None:
        args.max_tokens = 12000
    print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args.gen_subset)
    print('| {} {} {} examples'.format(args.data, args.gen_subset,
                                       len(task.dataset(args.gen_subset))))

    # Set dictionaries
    src_dict = task.source_dictionary
    tgt_dict = task.target_dictionary

    # Load ensemble
    print('| loading model(s) from {}'.format(args.path))
    models, _ = utils.load_ensemble_for_inference(args.path.split(':'),
                                                  task,
                                                  model_arg_overrides=eval(
                                                      args.model_overrides))

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam)
        if args.fp16:
            model.half()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Load dataset (possibly sharded)
    itr = data.EpochBatchIterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=models[0].max_positions(),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=8,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
    ).next_epoch_itr(shuffle=False)

    # Initialize generator
    gen_timer = StopwatchMeter()
    if args.score_reference:
        translator = SequenceScorer(models, task.target_dictionary)
    else:
        translator = SequenceGenerator(
            models,
            task.target_dictionary,
            beam_size=args.beam,
            stop_early=(not args.no_early_stop),
            normalize_scores=(not args.unnormalized),
            len_penalty=args.lenpen,
            unk_penalty=args.unkpen,
            sampling=args.sampling,
            sampling_topk=args.sampling_topk,
            minlen=args.min_len,
        )

    if use_cuda:
        translator.cuda()

    # Generate and compute BLEU score
    scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk())
    num_sentences = 0
    has_target = True
    with progress_bar.build_progress_bar(args, itr) as t:
        if args.score_reference:
            translations = translator.score_batched_itr(t,
                                                        cuda=use_cuda,
                                                        timer=gen_timer)
        else:
            translations = translator.generate_batched_itr(
                t,
                maxlen_a=args.max_len_a,
                maxlen_b=args.max_len_b,
                cuda=use_cuda,
                timer=gen_timer,
                prefix_size=args.prefix_size,
            )

        wps_meter = TimeMeter()
        for sample_id, src_tokens, target_tokens, hypos in translations:
            # Process input and ground truth
            has_target = target_tokens is not None
            target_tokens = target_tokens.int().cpu() if has_target else None

            # Either retrieve the original sentences or regenerate them from tokens.
            if align_dict is not None:
                src_str = task.dataset(
                    args.gen_subset).src.get_original_text(sample_id)
                target_str = task.dataset(
                    args.gen_subset).tgt.get_original_text(sample_id)
            else:
                src_str = src_dict.string(src_tokens, args.remove_bpe)
                if has_target:
                    target_str = tgt_dict.string(target_tokens,
                                                 args.remove_bpe,
                                                 escape_unk=True)

            if not args.quiet:
                print('S-{}\t{}'.format(sample_id, src_str))
                if has_target:
                    print('T-{}\t{}'.format(sample_id, target_str))

            # Process top predictions
            for i, hypo in enumerate(hypos[:min(len(hypos), args.nbest)]):
                hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                    hypo_tokens=hypo['tokens'].int().cpu(),
                    src_str=src_str,
                    alignment=hypo['alignment'].int().cpu(),
                    align_dict=align_dict,
                    tgt_dict=tgt_dict,
                    remove_bpe=args.remove_bpe,
                )

                if not args.quiet:
                    print('H-{}\t{}\t{}'.format(sample_id, hypo['score'],
                                                hypo_str))
                    print('P-{}\t{}'.format(
                        sample_id, ' '.join(
                            map(
                                lambda x: '{:.4f}'.format(x),
                                hypo['positional_scores'].tolist(),
                            ))))
                    print('A-{}\t{}'.format(
                        sample_id,
                        ' '.join(map(lambda x: str(utils.item(x)),
                                     alignment))))

                # Score only the top hypothesis
                if has_target and i == 0:
                    if align_dict is not None or args.remove_bpe is not None:
                        # Convert back to tokens for evaluation with unk replacement and/or without BPE
                        target_tokens = tokenizer.Tokenizer.tokenize(
                            target_str, tgt_dict, add_if_not_exist=True)
                    scorer.add(target_tokens, hypo_tokens)

            wps_meter.update(src_tokens.size(0))
            t.log({'wps': round(wps_meter.avg)})
            num_sentences += 1

    print(
        '| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'
        .format(num_sentences, gen_timer.n, gen_timer.sum,
                num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if has_target:
        print('| Generate {} with beam={}: {}'.format(args.gen_subset,
                                                      args.beam,
                                                      scorer.result_string()))
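The P- lines above print one score per generated token. A small sketch of turning such a line into a sentence perplexity, assuming the scores are natural-log probabilities as in the hypo dicts; the numbers are illustrative:

# Sketch: per-sentence perplexity from the positional scores behind a
# P- line (toy values; assumes natural-log probabilities).
import math

positional_scores = [-0.11, -0.47, -1.32, -0.05]
avg_logprob = sum(positional_scores) / len(positional_scores)
print('ppl: {:.2f}'.format(math.exp(-avg_logprob)))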
Code Example #22
    def forward(
        self,
        sample,
        forward_model,
        forward_optimizer,
        tgt_dict,
        backward_model,
        backward_optimizer,
        src_dict,
        lm_scorer=None,
        reduce=True,
        **generate_kwargs,
    ):
        """Compute the reconstruction and LM loss from forward and backward
        models.

        Args:
            sample: original input.
            hypos: psudo labels generated by the forward model. They are used
                as approximation of the target space to do importance sampling.
            forward_model: the model used to generate psuedo labels.
            backward_model: the model to reconstruct original input using
                psuedo labels.
            lm_scorer: an LM model eval mode to score psuedo labels in target
                space.
        """
        # Generate translations
        nbest_translations = self._generate_translation(
            forward_model, tgt_dict, sample, **generate_kwargs)

        forward_samples = []
        backward_samples = []
        # TODO (T36875783): load pretrained lm to score
        lm_score = 0.5
        eos_index = tgt_dict.eos()
        for id, src, hypos in nbest_translations:
            # compute each model's reward
            forward_reward = lm_score
            # construct the sample; compute the ce loss
            # backward_samples need to handle EOS
            original_src = src
            bt_src = hypos[0]["tokens"]
            # add EOS to the target, i.e. original source, since it'll be used
            # as target
            if original_src[-1] != eos_index:
                original_src = torch.cat(
                    [original_src.cpu(),
                     torch.LongTensor([eos_index])])
            # Removing EOS from the source is optional.
            if self.remove_eos_at_src:
                bt_src = bt_src[:-1]
            backward_sample = {
                "id": id,
                "source": bt_src.cpu(),  # first hypo is best hypo
                "target": original_src.cpu(),
                "weight": 1.0 - self.alpha,
            }
            backward_samples.append(backward_sample)
            # use bleu score as reward
            bwd_model_input = utils.move_to_cuda(
                WeightedLanguagePairDataset.collate(
                    samples=[backward_sample],
                    pad_idx=src_dict.pad(),
                    eos_idx=src_dict.eos(),
                ))
            reconstructed_source = self._generate_translation(
                backward_model, src_dict, bwd_model_input, **generate_kwargs)
            scorer = bleu.Scorer(src_dict.pad(), src_dict.eos(),
                                 src_dict.unk())
            for _, _, x_hypos in reconstructed_source:
                x_hat = x_hypos[0]["tokens"][:-1]
                scorer.add(original_src.int().cpu(), x_hat.int().cpu())
            backward_reward = scorer.score(order=4) / 100.0

            total_reward = (self.alpha * forward_reward +
                            (1.0 - self.alpha) * backward_reward)

            assert hypos[0]["tokens"][-1] == eos_index, (
                f"Expected generated translation to have eos (id: "
                f"{eos_index}) at end, but instead found token id "
                f"{hypos[0]['tokens'][-1]} at end.")
            forward_samples.append({
                "id": id,
                "source": src.cpu(),
                "target": hypos[0]["tokens"].cpu(),  # first hypo is best hypo
                "weight": total_reward,
            })

        # Now combine pseudo labelled examples to corresponding batch with
        # rewards factored to weighting of each task's loss
        agg_loss, agg_sample_size, agg_logging_output = 0.0, 0.0, {}
        forward_model.train()
        forward_loss, sample_size, logging_output = self.task.criterion(
            forward_model,
            utils.move_to_cuda(
                WeightedLanguagePairDataset.collate(
                    samples=forward_samples,
                    pad_idx=tgt_dict.pad(),
                    eos_idx=tgt_dict.eos(),
                )),
        )
        agg_loss += forward_loss.detach().item()
        agg_sample_size += sample_size
        agg_logging_output["primal"] = logging_output
        # grad would be further scaled when passed back to trainer,
        # which will do the update
        forward_optimizer.backward(forward_loss)

        backward_model.train()
        backward_loss, sample_size, logging_output = self.task.criterion(
            backward_model,
            utils.move_to_cuda(
                WeightedLanguagePairDataset.collate(
                    samples=backward_samples,
                    pad_idx=src_dict.pad(),
                    eos_idx=src_dict.eos(),
                )),
        )

        agg_loss += backward_loss.data.item()
        agg_sample_size += sample_size
        agg_logging_output["dual"] = logging_output
        backward_optimizer.backward(backward_loss)
        return agg_loss, agg_sample_size, agg_logging_output
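The weight attached to each forward sample interpolates the (stubbed) LM reward with the reconstruction-BLEU reward. A toy numeric check of that interpolation; alpha and the reward values are made up:

# Sketch: the reward interpolation used for forward samples. In the
# example the LM reward is a 0.5 stub and the backward reward is
# reconstruction BLEU rescaled by 1/100.
alpha = 0.7
forward_reward = 0.5
backward_reward = 42.0 / 100
total_reward = alpha * forward_reward + (1.0 - alpha) * backward_reward
print(total_reward)  # 0.476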