Example 1
def main():
    parser = argparse.ArgumentParser(
        description=("Rescore generated hypotheses with extra models"))
    add_args(parser)
    add_args_rescore(parser)
    args = parser.parse_args()

    assert (args.translation_info_export_path is not None
            ), "--translation_info_export_path is required for rescoring"

    assert args.l2r_model_path is not None, "Rescoring needs forward model"

    _, _, forward_task = utils.load_diverse_ensemble_for_inference(
        [args.l2r_model_path])
    rescorer = Rescorer(args, forward_task)
    dst_dict = forward_task.tgt_dict
    base_bleu_scorer = bleu.Scorer(dst_dict.pad(), dst_dict.eos(),
                                   dst_dict.unk())
    rescoring_bleu_scorer = bleu.Scorer(
        bleu.BleuConfig(
            pad=dst_dict.pad(),
            eos=dst_dict.eos(),
            unk=dst_dict.unk(),
        ))

    with open(args.translation_info_export_path, "rb") as file:
        translation_info_list = pickle.load(file)

    scores_to_export_list = []
    trans_batch_info = []
    for k in tqdm(range(0, len(translation_info_list), args.batch_size)):
        trans_batch_info = translation_info_list[k:k + args.batch_size]
        for j in range(len(trans_batch_info)):
            trans_batch_info[j]["hypos"] = [{
                "score": hypo["score"],
                "tokens": hypo["tokens"].cuda()
            } for hypo in trans_batch_info[j]["hypos"]]
        top_tokens, scores_to_export = find_top_tokens(args, trans_batch_info,
                                                       rescorer,
                                                       dst_dict.pad())
        if args.scores_info_export_path is not None:
            scores_to_export_list += scores_to_export

        for i, trans_info in enumerate(trans_batch_info):
            base_bleu_scorer.add(
                trans_info["target_tokens"].int().cpu(),
                trans_info["hypos"][0]["tokens"].int().cpu(),
            )
            rescoring_bleu_scorer.add(trans_info["target_tokens"].int().cpu(),
                                      top_tokens[i].int().cpu())
        trans_batch_info = []

    print("| Base ", base_bleu_scorer.result_string())
    print("| Rescoring ", rescoring_bleu_scorer.result_string())

    if args.scores_info_export_path is not None:
        with open(args.scores_info_export_path, "wb") as file:
            pickle.dump(scores_to_export_list, file)
Example 2
    def calculate_metric(self):
        total_exact_match = 0
        total_f1 = 0.0
        num_samples = len(self.all_targets)

        trg_vocab = self.tensorizers["trg_seq_tokens"].vocab
        bleu_scorer = bleu.Scorer(
            bleu.BleuConfig(
                pad=trg_vocab.get_pad_index(),
                eos=trg_vocab.get_eos_index(),
                unk=trg_vocab.get_unk_index(),
            ))

        for (beam_preds, target) in zip(self.all_preds, self.all_targets):
            pred = beam_preds[0]
            if self._compare_target_prediction_tokens(pred, target):
                total_exact_match += 1
            total_f1 += compute_f1(pred, target)
            # BLEU metric calculation must be done with tensors on CPU, or the
            # type checks in fairseq/bleu.py:add() will fail
            bleu_scorer.add(
                torch.IntTensor(target).cpu(),
                torch.IntTensor(pred).cpu())

        loss = self.calculate_loss()
        exact_match = round(
            safe_division(total_exact_match, num_samples) * 100.0, 2)
        f1 = round(safe_division(total_f1, num_samples) * 100.0, 2)
        bleu_score = round(
            0.0 if len(self.all_preds) == 0 else bleu_scorer.score(), 2)

        return Seq2SeqMetrics(loss, exact_match, f1, bleu_score)
Example 3
 def __init__(self, tgt_dict, bpe_symbol='@@ '):
     self.tgt_dict = tgt_dict
     self.bpe_symbol = bpe_symbol
     self.scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(),
                               tgt_dict.unk())
     # use a fresh Dictionary for scoring, so that we can add new elements
     self.scoring_dict = Dictionary()
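Note that these snippets mix two constructor styles: older fairseq versions pass the pad/eos/unk indices positionally (as here), while newer ones wrap them in a bleu.BleuConfig (as in Examples 2, 6, 8 and 15). A minimal sketch of the newer form, assuming a fairseq Dictionary named tgt_dict is available:

from fairseq.scoring import bleu

# newer API: indices are wrapped in a BleuConfig dataclass
scorer = bleu.Scorer(
    bleu.BleuConfig(
        pad=tgt_dict.pad(),
        eos=tgt_dict.eos(),
        unk=tgt_dict.unk(),
    )
)
# older API, equivalent to the call in this example:
# scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk())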
Example 4
def smoothed_sentence_bleu(task, target_tokens, hypo_tokens):
    """
    Implements "Smoothing 3" method from Chen and Cherry. "A Systematic
    Comparison of Smoothing Techniques for Sentence-Level BLEU".
    http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf
    """
    dst_dict = task.target_dictionary
    scorer = bleu.Scorer(dst_dict.pad(), dst_dict.eos(), dst_dict.unk())
    scorer.add(target_tokens, hypo_tokens)

    invcnt = 1
    ratios = []
    for (match, count) in [
        (scorer.stat.match1, scorer.stat.count1),
        (scorer.stat.match2, scorer.stat.count2),
        (scorer.stat.match3, scorer.stat.count3),
        (scorer.stat.match4, scorer.stat.count4),
    ]:
        if count == 0:
            # disregard n-grams for values of n larger than hypothesis length
            continue
        if match == 0:
            invcnt *= 2
            match = 1.0 / invcnt
        ratios.append(match / count)

    brevity_penalty = np.min(
        [1, np.exp(1 - (scorer.stat.reflen / scorer.stat.predlen))]
    )
    geometric_mean = np.exp(np.log(ratios).mean())
    smoothed_bleu = brevity_penalty * geometric_mean
    return smoothed_bleu
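To make the smoothing step concrete, here is a small self-contained sketch of the same arithmetic on hand-picked n-gram statistics (the match/count pairs and lengths are invented for illustration):

import numpy as np

# hypothetical (match, count) pairs for 1- to 4-grams of a 5-token hypothesis
stats = [(3, 5), (1, 4), (0, 3), (0, 2)]
ref_len, hyp_len = 6, 5

invcnt = 1
ratios = []
for match, count in stats:
    if count == 0:
        continue  # n-gram order longer than the hypothesis
    if match == 0:
        invcnt *= 2  # Smoothing 3: pseudo-counts 1/2, 1/4, ... for successive zero matches
        match = 1.0 / invcnt
    ratios.append(match / count)

brevity_penalty = min(1.0, np.exp(1 - ref_len / hyp_len))
smoothed_bleu = brevity_penalty * np.exp(np.log(ratios).mean())
print(round(smoothed_bleu, 4))  # ~0.19 for these made-up statistics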
Example 5
 def score(fdsys):
     with open(args.ref) as fdref:
         scorer = bleu.Scorer(dict.pad(), dict.eos(), dict.unk())
         for sys_tok, ref_tok in zip(readlines(fdsys),
                                     readlines(fdref)):
             sys_tok = dict.encode_line(sys_tok)
             ref_tok = dict.encode_line(ref_tok)
             scorer.add(ref_tok, sys_tok)
         print(scorer.result_string(args.order))
Example 6
 def __init__(self, args, src_dict, dst_dict):
     super().__init__(args, src_dict, dst_dict)
     self.translator = None
     self.scorer = bleu.Scorer(
         bleu.BleuConfig(
             pad=dst_dict.pad(),
             eos=dst_dict.eos(),
             unk=dst_dict.unk(),
         )
     )
Example 7
 def score(fdsys):
     with open(args.ref) as fdref:
         scorer = bleu.Scorer(dict.pad(), dict.eos(), dict.unk())
         for i, (sys_tok, ref_tok) in enumerate(
                 zip(readlines(fdsys), readlines(fdref))):
             scorer.reset(one_init=True)
             sys_tok = dict.encode_line(sys_tok)
             ref_tok = dict.encode_line(ref_tok)
             scorer.add(ref_tok, sys_tok)
             print(i, scorer.result_string(args.order))
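Example 5 accumulates statistics over the whole file and reports one corpus-level score, whereas this variant calls scorer.reset(one_init=True) before each pair so that every line gets its own score. A minimal sketch of the same per-sentence pattern on in-memory lines (the dictionary and the sentences are illustrative, not from the original script):

from fairseq.data import Dictionary
from fairseq.scoring import bleu

d = Dictionary()  # fresh dictionary; encode_line adds unseen tokens by default
scorer = bleu.Scorer(bleu.BleuConfig(pad=d.pad(), eos=d.eos(), unk=d.unk()))

pairs = [("the cat sat", "the cat sat"), ("a dog ran", "a dog runs")]
for i, (sys_line, ref_line) in enumerate(pairs):
    scorer.reset(one_init=True)  # restart counts so each sentence is scored on its own
    sys_tok = d.encode_line(sys_line)
    ref_tok = d.encode_line(ref_line)
    scorer.add(ref_tok, sys_tok)
    print(i, scorer.result_string(order=4))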
Example 8
def build_scorer(choice, tgt_dict):
    _choice = choice._name if isinstance(choice, DictConfig) else choice

    if _choice == "bleu":
        from fairseq.scoring import bleu

        return bleu.Scorer(
            bleu.BleuConfig(pad=tgt_dict.pad(), eos=tgt_dict.eos(), unk=tgt_dict.unk())
        )
    return _build_scorer(choice)
Example 9
def build_scorer(args, tgt_dict):
    from fairseq import utils

    if args.sacrebleu:
        utils.deprecation_warning(
            "--sacrebleu is deprecated. Please use --scoring sacrebleu instead."
        )
        args.scoring = "sacrebleu"
    if args.scoring == "bleu":
        from fairseq.scoring import bleu
        return bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk())
    return _build_scorer(args)
Example 10
def evaluate_weights(scores_info, feature_weights, length_penalty):
    scorer = bleu.Scorer(vocab_constants.PAD_ID, vocab_constants.EOS_ID,
                         vocab_constants.UNK_ID)

    for example in scores_info:
        weighted_scores = (example["scores"] * feature_weights).sum(axis=1)
        weighted_scores /= (example["tgt_len"]**length_penalty) + 1e-12
        top_hypo_ind = np.argmax(weighted_scores)
        top_hypo = example["hypos"][top_hypo_ind]
        ref = example["target_tokens"]
        scorer.add(torch.IntTensor(ref), torch.IntTensor(top_hypo))

    return scorer.score()
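The expected layout of each scores_info entry can be read off the field accesses above; a hypothetical two-hypothesis entry (all token ids and feature values are invented) might look like this:

import numpy as np

example = {
    # one row of feature scores per hypothesis (here 2 hypotheses x 3 features)
    "scores": np.array([[-1.2, -0.8, -2.0],
                        [-1.0, -1.1, -1.5]]),
    "tgt_len": np.array([5, 7]),  # length of each hypothesis, used by the length penalty
    "hypos": [[10, 11, 12, 13, 2], [10, 14, 12, 15, 16, 13, 2]],
    "target_tokens": [10, 11, 12, 13, 2],
}
# bleu_of_top = evaluate_weights([example], np.array([1.0, 0.0, 0.0]), length_penalty=1)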
Example 11
 def compute_many(guess: torch.Tensor, answers: torch.Tensor, pad_idx,
                  end_idx, unk_idx):
     """
     Return BLEU-1..4 using fairseq and tokens.
     """
     if fairseqbleu is None:
         return None
     scorer = fairseqbleu.Scorer(pad_idx, end_idx, unk_idx)
     answers = answers.cpu().int()
     guess = guess.cpu().int()
     scorer.add(answers, guess)
     return [
         FairseqBleuMetric(scorer.score(i) / 100.0) for i in range(1, 5)
     ]
Example 12
    def calculate_metric(self):
        num_correct = 0
        total_count = len(self.all_targets)
        trg_vocab = self.tensorizers["trg_seq_tokens"].vocab
        bleu_scorer = bleu.Scorer(
            trg_vocab.get_pad_index(),
            trg_vocab.get_eos_index(),
            trg_vocab.get_unk_index(),
        )
        for beam_pred, target in zip(self.all_preds, self.all_targets):
            pred = beam_pred[0]
            if self._compare_target_prediction_tokens(pred, target):
                num_correct = num_correct + 1
            # BLEU metric calculation must be done with tensors on CPU, or the
            # type checks in fairseq/bleu.py:add() will fail
            bleu_scorer.add(
                torch.IntTensor(target).cpu(),
                torch.IntTensor(pred).cpu())

        bleu_score = 0.0 if len(self.all_preds) == 0 else bleu_scorer.score()
        accuracy = safe_division(num_correct, total_count)
        cross_entropy_loss = self.calculate_loss()
        return Seq2SeqMetrics(accuracy, cross_entropy_loss, bleu_score)
Example 13
def main(args):
    assert args.path is not None, '--path required for generation!'
    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert args.replace_unk is None or args.raw_text, \
        '--replace-unk requires a raw text dataset (--raw-text)'

    import_user_module(args)

    if args.max_tokens is None and args.max_sentences is None:
        args.max_tokens = 12000
    print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # ========== for bartsv task, rebuild dictionary after model args are loaded ==========
    # assert not hasattr(args, 'node_freq_min'), 'node_freq_min should be read from model args'
    # args.node_freq_min = 5    # temporarily set before model loading, as this is needed in tasks.setup_task(args)
    # =====================================================================================

    # Load dataset splits
    task = tasks.setup_task(args)
    # Note: states are not needed since they will be provided by the state
    # machine
    task.load_dataset(args.gen_subset, state_machine=False)

    # Set dictionaries
    try:
        src_dict = getattr(task, 'source_dictionary', None)
    except NotImplementedError:
        src_dict = None
    tgt_dict = task.target_dictionary

    # Load ensemble
    print('| loading model(s) from {}'.format(args.path))
    try:
        models, _model_args = checkpoint_utils.load_model_ensemble(
            args.path.split(':'),
            arg_overrides=eval(args.model_overrides),
            task=task,
        )
    except:
        # NOTE this is for "bartsv" models when the default "args.node_freq_min" (5) does not match the model:
        #      when loading the model with the above task, building the model fails because the task's
        #      target vocabulary has a different size
        # TODO better handle these cases (without sacrificing compatibility with other model archs)
        models, _model_args = checkpoint_utils.load_model_ensemble(
            args.path.split(':'),
            arg_overrides=eval(args.model_overrides),
            task=None,
        )

    # ========== for bartsv task, rebuild the dictionary based on model args ==========
    if 'bartsv' in _model_args.arch and args.node_freq_min != _model_args.node_freq_min:
        args.node_freq_min = _model_args.node_freq_min
        # Load dataset splits
        task = tasks.setup_task(args)
        # Note: states are not needed since they will be provided by the state machine
        task.load_dataset(args.gen_subset, state_machine=False)

        # Set dictionaries
        try:
            src_dict = getattr(task, 'source_dictionary', None)
        except NotImplementedError:
            src_dict = None
        tgt_dict = task.target_dictionary
    # ==================================================================================

    # import pdb; pdb.set_trace()
    # print(_model_args)

    # ========== for previous model trained when new arguments were not there ==========
    if not hasattr(_model_args, 'shift_pointer_value'):
        _model_args.shift_pointer_value = 1
    # ==================================================================================

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align
    # dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=None,
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=args.required_batch_size_multiple,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
        # large_sent_first=False        # not in fairseq
    ).next_epoch_itr(shuffle=False)

    # Initialize generator
    gen_timer = StopwatchMeter()
    generator = task.build_generator(args, _model_args)

    # Generate and compute BLEU score
    if args.sacrebleu:
        scorer = bleu.SacrebleuScorer()
    else:
        scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk())
    num_sentences = 0
    has_target = True

    examples = Examples(args.path, args.results_path, args.gen_subset,
                        args.nbest)

    error_stats = {'num_sub_start': 0}

    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        for sample in t:
            sample = utils.move_to_cuda(sample) if use_cuda else sample
            if 'net_input' not in sample:
                raise Exception("Did not expect empty sample")

            prefix_tokens = None
            if args.prefix_size > 0:
                prefix_tokens = sample['target'][:, :args.prefix_size]

            # breakpoint()

            gen_timer.start()
            hypos = task.inference_step(generator, models, sample, args,
                                        prefix_tokens)
            num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos)
            gen_timer.stop(num_generated_tokens)

            # breakpoint()

            for i, sample_id in enumerate(sample['id'].tolist()):
                has_target = sample['target'] is not None

                # Remove padding
                src_tokens = utils.strip_pad(
                    sample['net_input']['src_tokens'][i, :], tgt_dict.pad())
                target_tokens = None
                if has_target:
                    target_tokens = utils.strip_pad(
                        sample['target'][i, :], tgt_dict.pad()).int().cpu()

                # Either retrieve the original sentences or regenerate them from tokens.
                if align_dict is not None:
                    src_str = task.dataset(
                        args.gen_subset).src.get_original_text(sample_id)
                    target_str = task.dataset(
                        args.gen_subset).tgt.get_original_text(sample_id)
                else:
                    if src_dict is not None:
                        src_str = src_dict.string(src_tokens, args.remove_bpe)
                    else:
                        src_str = ""
                    if has_target:
                        target_str = tgt_dict.string(target_tokens,
                                                     args.remove_bpe,
                                                     escape_unk=True)

                # debug: '<<unk>>' is added to the dictionary
                # if 'unk' in target_str:
                #     breakpoint()
                # ==========> NOTE we do not really have the ground truth target (with the same alignments)
                #                  target_str might have <unk> as the target dictionary is only built on training data
                #                  but it doesn't matter. It should not affect the target dictionary!

                if not args.quiet:
                    if src_dict is not None:
                        print('S-{}\t{}'.format(sample_id, src_str))
                    if has_target:
                        print('T-{}\t{}'.format(sample_id, target_str))

                # Process top predictions
                for j, hypo in enumerate(hypos[i][:args.nbest]):
                    # hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                    #     hypo_tokens=hypo['tokens'].int().cpu(),
                    #     src_str=src_str,
                    #     alignment=hypo['alignment'].int().cpu() if hypo['alignment'] is not None else None,
                    #     align_dict=align_dict,
                    #     tgt_dict=tgt_dict,
                    #     remove_bpe=args.remove_bpe,
                    #     # FIXME: AMR specific
                    #     split_token="\t",
                    #     line_tokenizer=task.tokenize,
                    # )

                    if 'bartsv' in _model_args.arch:
                        if not tgt_dict[hypo['tokens'][0]].startswith(
                                tgt_dict.bpe.INIT):
                            error_stats['num_sub_start'] += 1

                        try:
                            actions_nopos, actions_pos, actions = post_process_action_pointer_prediction_bartsv(
                                hypo, tgt_dict)
                        except:
                            breakpoint()
                    else:
                        actions_nopos, actions_pos, actions = post_process_action_pointer_prediction(
                            hypo, tgt_dict)

                    # breakpoint()

                    if args.clean_arcs:
                        actions_nopos, actions_pos, actions, invalid_idx = clean_pointer_arcs(
                            actions_nopos, actions_pos, actions)

                    # TODO these are just dummy values so that the reference code below runs
                    hypo_tokens = hypo['tokens'].int().cpu()
                    hypo_str = '\t'.join(actions)
                    alignment = None

                    # update the list of examples
                    examples.append({
                        'actions_nopos': actions_nopos,
                        'actions_pos': actions_pos,
                        'actions': actions,
                        'reference': target_str,
                        'src_str': src_str,
                        'sample_id': sample_id
                    })

                    if not args.quiet:
                        print('H-{}\t{}\t{}'.format(sample_id, hypo_str,
                                                    hypo['score']))
                        print('P-{}\t{}'.format(
                            sample_id, ' '.join(
                                map(
                                    lambda x: '{:.4f}'.format(x),
                                    hypo['positional_scores'].tolist(),
                                ))))

                        if args.print_alignment:
                            print('A-{}\t{}'.format(
                                sample_id, ' '.join(
                                    map(lambda x: str(utils.item(x)),
                                        alignment))))

                    # Score only the top hypothesis
                    if has_target and j == 0:
                        if align_dict is not None or args.remove_bpe is not None:
                            # Convert back to tokens for evaluation with unk replacement and/or without BPE
                            target_tokens = tgt_dict.encode_line(
                                target_str, add_if_not_exist=False)
                            # NOTE do not modify the tgt dictionary with 'add_if_not_exist=True'!
                        if hasattr(scorer, 'add_string'):
                            scorer.add_string(target_str, hypo_str)
                        else:
                            scorer.add(target_tokens, hypo_tokens)

            wps_meter.update(num_generated_tokens)
            t.log({'wps': round(wps_meter.avg)})
            num_sentences += sample['nsentences']

    # Save examples to files
    examples.save()

    print('| Error case (handled by manual fix) statistics:')
    print(error_stats)

    print(
        '| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'
        .format(num_sentences, gen_timer.n, gen_timer.sum,
                num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if has_target:
        print('| Generate {} with beam={}: {}'.format(args.gen_subset,
                                                      args.beam,
                                                      scorer.result_string()))
    return scorer
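Both this example and Example 16 choose between the string-level sacrebleu scorer and the token-level scorer at runtime, then dispatch on whether add_string is available. A minimal sketch of that pattern, following the older constructor used here and assuming args, tgt_dict, one (reference, hypothesis) string pair, the corresponding token tensors, and the surrounding script's bleu import are already in scope:

if args.sacrebleu:
    scorer = bleu.SacrebleuScorer()
else:
    scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk())

if hasattr(scorer, 'add_string'):
    scorer.add_string(target_str, hypo_str)  # sacrebleu scores detokenized strings
else:
    scorer.add(target_tokens, hypo_tokens)   # token-level BLEU scores Int tensors
print(scorer.result_string())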
Example 14
 def __init__(self, args, src_dict, dst_dict):
     super().__init__(args, src_dict, dst_dict)
     self.translator = None
     self.scorer = bleu.Scorer(dst_dict.pad(), dst_dict.eos(),
                               dst_dict.unk())
Example 15
def score_target_hypo(args, a, b, c, lenpen, target_outfile, hypo_outfile,
                      write_hypos, normalize):

    print("lenpen", lenpen, "weight1", a, "weight2", b, "weight3", c)
    gen_output_lst, bitext1_lst, bitext2_lst, lm_res_lst = load_score_files(
        args)
    dict = dictionary.Dictionary()
    scorer = bleu.Scorer(
        bleu.BleuConfig(
            pad=dict.pad(),
            eos=dict.eos(),
            unk=dict.unk(),
        ))

    ordered_hypos = {}
    ordered_targets = {}

    for shard_id in range(len(bitext1_lst)):
        bitext1 = bitext1_lst[shard_id]
        bitext2 = bitext2_lst[shard_id]
        gen_output = gen_output_lst[shard_id]
        lm_res = lm_res_lst[shard_id]

        total = len(bitext1.rescore_source.keys())
        source_lst = []
        hypo_lst = []
        score_lst = []
        reference_lst = []
        j = 1
        best_score = -math.inf

        for i in range(total):
            # length is measured in terms of words, not bpe tokens, since models may not share the same bpe
            target_len = len(bitext1.rescore_hypo[i].split())

            if lm_res is not None:
                lm_score = lm_res.score[i]
            else:
                lm_score = 0

            if bitext2 is not None:
                bitext2_score = bitext2.rescore_score[i]
                bitext2_backwards = bitext2.backwards
            else:
                bitext2_score = None
                bitext2_backwards = None

            score = rerank_utils.get_score(
                a,
                b,
                c,
                target_len,
                bitext1.rescore_score[i],
                bitext2_score,
                lm_score=lm_score,
                lenpen=lenpen,
                src_len=bitext1.source_lengths[i],
                tgt_len=bitext1.target_lengths[i],
                bitext1_backwards=bitext1.backwards,
                bitext2_backwards=bitext2_backwards,
                normalize=normalize,
            )

            if score > best_score:
                best_score = score
                best_hypo = bitext1.rescore_hypo[i]

            if j == gen_output.num_hypos[i] or j == args.num_rescore:
                j = 1
                hypo_lst.append(best_hypo)
                score_lst.append(best_score)
                source_lst.append(bitext1.rescore_source[i])
                reference_lst.append(bitext1.rescore_target[i])

                best_score = -math.inf
                best_hypo = ""
            else:
                j += 1

        gen_keys = list(sorted(gen_output.no_bpe_target.keys()))

        for key in range(len(gen_keys)):
            if args.prefix_len is None:
                assert hypo_lst[key] in gen_output.no_bpe_hypo[
                    gen_keys[key]], ("pred and rescore hypo mismatch: i: " +
                                     str(key) + ", " + str(hypo_lst[key]) +
                                     str(gen_keys[key]) +
                                     str(gen_output.no_bpe_hypo[key]))
                sys_tok = dict.encode_line(hypo_lst[key])
                ref_tok = dict.encode_line(
                    gen_output.no_bpe_target[gen_keys[key]])
                scorer.add(ref_tok, sys_tok)

            else:
                full_hypo = rerank_utils.get_full_from_prefix(
                    hypo_lst[key], gen_output.no_bpe_hypo[gen_keys[key]])
                sys_tok = dict.encode_line(full_hypo)
                ref_tok = dict.encode_line(
                    gen_output.no_bpe_target[gen_keys[key]])
                scorer.add(ref_tok, sys_tok)

        # if only one set of hyperparameters is provided, write the predictions to a file
        if write_hypos:
            # recover the original ids from n-best list generation
            for key in range(len(gen_output.no_bpe_target)):
                if args.prefix_len is None:
                    assert hypo_lst[key] in gen_output.no_bpe_hypo[
                        gen_keys[key]], ("pred and rescore hypo mismatch:" +
                                         "i:" + str(key) + str(hypo_lst[key]) +
                                         str(gen_output.no_bpe_hypo[key]))
                    ordered_hypos[gen_keys[key]] = hypo_lst[key]
                    ordered_targets[gen_keys[key]] = gen_output.no_bpe_target[
                        gen_keys[key]]

                else:
                    full_hypo = rerank_utils.get_full_from_prefix(
                        hypo_lst[key], gen_output.no_bpe_hypo[gen_keys[key]])
                    ordered_hypos[gen_keys[key]] = full_hypo
                    ordered_targets[gen_keys[key]] = gen_output.no_bpe_target[
                        gen_keys[key]]

    # write the hypos in the original order from nbest list generation
    if args.num_shards == (len(bitext1_lst)):
        with open(target_outfile, "w") as t:
            with open(hypo_outfile, "w") as h:
                for key in range(len(ordered_hypos)):
                    t.write(ordered_targets[key])
                    h.write(ordered_hypos[key])

    res = scorer.result_string(4)
    if write_hypos:
        print(res)
    score = rerank_utils.parse_bleu_scoring(res)
    return score
Example 16
def _generate_score(models, args, task, dataset, modify_target_dict):
    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load ensemble
    if not args.quiet:
        print(
            "| loading model(s) from {}".format(
                ", ".join(args.path.split(CHECKPOINT_PATHS_DELIMITER))
            )
        )

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=True,
        )

    translator = build_sequence_generator(args, task, models)
    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    print("seed number is " + str(args.max_examples_to_evaluate_seed))
    if args.max_examples_to_evaluate > 0:
        pytorch_translate_data.subsample_pair_dataset(
            dataset, args.max_examples_to_evaluate, args.max_examples_to_evaluate_seed
        )

    # Keep track of translations: initialize with empty translations
    # and zero probability scores
    translated_sentences = [""] * len(dataset)
    translated_scores = [0.0] * len(dataset)
    hypos_list = []

    collect_output_hypos = getattr(args, "output_hypos_binary_path", False)
    if collect_output_hypos:
        output_hypos_token_arrays = [None] * len(dataset)

    # Generate and compute BLEU score
    dst_dict = task.target_dictionary
    if args.sacrebleu:
        scorer = bleu.SacrebleuScorer()
    else:
        scorer = bleu.Scorer(dst_dict.pad(), dst_dict.eos(), dst_dict.unk())

    itr = task.get_batch_iterator(
        dataset=dataset,
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(), *[model.max_positions() for model in models]
        ),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=8,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)

    oracle_scorer = None
    if args.report_oracle_bleu:
        oracle_scorer = bleu.Scorer(dst_dict.pad(), dst_dict.eos(), dst_dict.unk())

    rescorer = None
    num_sentences = 0
    translation_samples = []
    translation_info_list = []
    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        gen_timer = StopwatchMeter()
        translations = translator.generate_batched_itr(
            t,
            maxlen_a=args.max_len_a,
            maxlen_b=args.max_len_b,
            cuda=use_cuda,
            timer=gen_timer,
            prefix_size=1
            if pytorch_translate_data.is_multilingual_many_to_one(args)
            else 0,
        )

        for trans_info in _iter_translations(
            args, task, dataset, translations, align_dict, rescorer, modify_target_dict
        ):
            if hasattr(scorer, "add_string"):
                scorer.add_string(trans_info.target_str, trans_info.hypo_str)
            else:
                scorer.add(trans_info.target_tokens, trans_info.hypo_tokens)
            if oracle_scorer is not None:
                oracle_scorer.add(trans_info.target_tokens, trans_info.best_hypo_tokens)

            if getattr(args, "translation_output_file", False):
                translated_sentences[trans_info.sample_id] = trans_info.hypo_str
            if getattr(args, "translation_probs_file", False):
                translated_scores[trans_info.sample_id] = trans_info.hypo_score
            if getattr(args, "hypotheses_export_path", False):
                hypos_list.append(trans_info.hypos)
            if collect_output_hypos:
                output_hypos_token_arrays[
                    trans_info.sample_id
                ] = trans_info.best_hypo_tokens
            if args.translation_info_export_path is not None:
                # Strip expensive data from hypotheses before saving
                hypos = [
                    {k: v for k, v in hypo.items() if k in ["tokens", "score"]}
                    for hypo in trans_info.hypos
                ]
                # Make sure everything is on cpu before exporting
                hypos = [
                    {"score": hypo["score"], "tokens": hypo["tokens"].cpu()}
                    for hypo in hypos
                ]
                translation_info_list.append(
                    {
                        "src_tokens": trans_info.src_tokens.cpu(),
                        "target_tokens": trans_info.target_tokens,
                        "hypos": hypos,
                    }
                )
            translation_samples.append(
                collections.OrderedDict(
                    {
                        "sample_id": trans_info.sample_id.item(),
                        "src_str": trans_info.src_str,
                        "target_str": trans_info.target_str,
                        "hypo_str": trans_info.hypo_str,
                    }
                )
            )
            wps_meter.update(trans_info.src_tokens.size(0))
            t.log({"wps": round(wps_meter.avg)})
            num_sentences += 1

    # If applicable, save collected hypothesis tokens to binary output file
    if collect_output_hypos:
        output_dataset = pytorch_translate_data.InMemoryIndexedDataset()
        output_dataset.load_from_sequences(output_hypos_token_arrays)
        output_dataset.save(args.output_hypos_binary_path)
    if args.output_source_binary_path:
        dataset.src.save(args.output_source_binary_path)
    if args.translation_info_export_path is not None:
        f = open(args.translation_info_export_path, "wb")
        pickle.dump(translation_info_list, f)
        f.close()

    # If applicable, save the translations and scores to the output files
    # These two outputs are used in dual learning for weighted backtranslation
    if getattr(args, "translation_output_file", False) and getattr(
        args, "translation_probs_file", False
    ):
        with open(args.translation_output_file, "w") as translation_file, open(
            args.translation_probs_file, "w"
        ) as score_file:
            for hypo_str, hypo_score in zip(translated_sentences, translated_scores):
                if len(hypo_str.strip()) > 0:
                    print(hypo_str, file=translation_file)
                    print(np.exp(hypo_score), file=score_file)

    # e.g., for external evaluation
    if getattr(args, "hypotheses_export_path", False):
        with open(args.hypotheses_export_path, "w") as out_file:
            for hypos in hypos_list:
                for hypo in hypos:
                    print(
                        task.tgt_dict.string(
                            hypo["tokens"], bpe_symbol=args.remove_bpe
                        ),
                        file=out_file,
                    )

    if oracle_scorer is not None:
        print(f"| Oracle BLEU (best hypo in beam): {oracle_scorer.result_string()}")

    return scorer, num_sentences, gen_timer, translation_samples
Example 17
def random_search(scores_info_export_path,
                  num_trials,
                  report_oracle_bleu=False):
    with open(scores_info_export_path, "rb") as f:
        scores_info = pickle.load(f)

    dummy_task = DummyTask()

    if report_oracle_bleu:
        oracle_scorer = bleu.Scorer(vocab_constants.PAD_ID,
                                    vocab_constants.EOS_ID,
                                    vocab_constants.UNK_ID)

        for example in scores_info:
            smoothed_bleu = []
            for hypo in example["hypos"]:
                eval_score = smoothed_sentence_bleu(
                    dummy_task,
                    torch.IntTensor(example["target_tokens"]),
                    torch.IntTensor(hypo),
                )
                smoothed_bleu.append(eval_score)
            best_hypo_ind = np.argmax(smoothed_bleu)
            example["best_hypo_ind"] = best_hypo_ind

            oracle_scorer.add(
                torch.IntTensor(example["target_tokens"]),
                torch.IntTensor(example["hypos"][best_hypo_ind]),
            )

        print("oracle BLEU: ", oracle_scorer.score())

    num_features = scores_info[0]["scores"].shape[1]
    assert all(
        example["scores"].shape[1] == num_features for example in
        scores_info), "All examples must have the same number of scores!"
    feature_weights = np.zeros(num_features)
    feature_weights[0] = 1
    score = evaluate_weights(scores_info, feature_weights, length_penalty=1)
    print("base BLEU: ", score)
    best_score = score
    best_weights = feature_weights
    best_length_penalty = 1

    nonzero_features = identify_nonzero_features(scores_info)

    for i in range(num_trials):
        feature_weights = np.zeros(num_features)
        random_weights = np.random.dirichlet(np.ones(nonzero_features.size))
        feature_weights[nonzero_features] = random_weights
        length_penalty = 1.5 * np.random.random()

        score = evaluate_weights(scores_info, feature_weights, length_penalty)
        if score > best_score:
            best_score = score
            best_weights = feature_weights
            best_length_penalty = length_penalty

        print(f"\r[{i}]  best: {best_score}", end="", flush=True)

    print()
    print("best weights: ", best_weights)
    print("best length penalty: ", best_length_penalty)

    return best_weights, best_length_penalty, best_score
Example 18
    def forward(
        self,
        sample,
        forward_model,
        forward_optimizer,
        tgt_dict,
        backward_model,
        backward_optimizer,
        src_dict,
        lm_scorer=None,
        reduce=True,
        **generate_kwargs,
    ):
        """Compute the reconstruction and LM loss from forward and backward
        models.

        Args:
            sample: original input.
            hypos: pseudo labels generated by the forward model. They are used
                as an approximation of the target space to do importance
                sampling.
            forward_model: the model used to generate pseudo labels.
            backward_model: the model that reconstructs the original input from
                the pseudo labels.
            lm_scorer: an LM model in eval mode used to score pseudo labels in
                the target space.
        """
        # Generate translations
        nbest_translations = self._generate_translation(
            forward_model, tgt_dict, sample, self.args.beam, **generate_kwargs)

        forward_samples = []
        backward_samples = {}
        # TODO (T36875783): load pretrained lm to score
        lm_score = 0.0
        for sample_id, src_processed, tgt_hypos in nbest_translations:
            # compute each model's reward
            forward_reward = lm_score
            # construct the sample; compute the ce loss
            # backward_samples need to handle EOS
            src = self._maybe_reverse_source(src_processed)
            src = self._maybe_add_eos(src, src_dict.eos())
            assert len(tgt_hypos) == self.args.beam
            for tgt_hypo_i, tgt_hypo_struct in enumerate(tgt_hypos):
                dual_sample_id = sample_id.item() * self.args.beam + tgt_hypo_i
                tgt_hypo = tgt_hypo_struct["tokens"]
                # add EOS to the target (i.e. the original source), since it'll
                # be used as the target; removing EOS from the src is optional
                if self.remove_eos_at_src:
                    tgt_hypo = tgt_hypo[:-1]
                tgt_hypo_processed = self._maybe_reverse_source(tgt_hypo)

                backward_sample = {
                    "id": dual_sample_id,
                    "source": tgt_hypo_processed.cpu(),
                    "target": src.cpu(),
                    "weight": 1.0 - self.alpha,
                }
                assert dual_sample_id not in backward_samples
                backward_samples[dual_sample_id] = backward_sample

        bwd_model_input = utils.move_to_cuda(
            WeightedLanguagePairDataset.collate(
                samples=list(backward_samples.values()),
                pad_idx=src_dict.pad(),
                eos_idx=src_dict.eos(),
            ))
        reconstructed_source = self._generate_translation(
            backward_model, src_dict, bwd_model_input, 1, **generate_kwargs)
        for dual_sample_id, tgt_hypo_processed, src_hypos in reconstructed_source:
            backward_sample = backward_samples[dual_sample_id.item()]
            src = backward_sample["target"]
            tgt_hypo = self._maybe_reverse_source(tgt_hypo_processed)

            # use bleu score as reward
            scorer = bleu.Scorer(src_dict.pad(), src_dict.eos(),
                                 src_dict.unk())
            assert len(src_hypos) == 1
            src_hypo = src_hypos[0]["tokens"][:-1]
            scorer.add(src.int().cpu(), src_hypo.int().cpu())
            backward_reward = (
                scorer.score(order=self.args.reconstruction_bleu_order) /
                100.0)

            original_stc = " ".join(src_dict[tid] for tid in src.tolist())
            translated_stc = " ".join(tgt_dict[tid] for tid in tgt_hypo)
            recon_stc = " ".join(src_dict[tid] for tid in src_hypo.tolist())

            if int(dual_sample_id / self.args.beam) % 100 == 0:
                print("--------")
                print(
                    "original sentence:",
                    original_stc.replace(self.args.source_bpe_end_marker, ""),
                )
                print(
                    "translated sentence:",
                    translated_stc.replace(self.args.source_bpe_end_marker,
                                           ""),
                )
                print(
                    "reconstructed sentence:",
                    recon_stc.replace(self.args.source_bpe_end_marker, ""),
                )
                print("reward:", backward_reward)
                print("--------")

            total_reward = (self.alpha * forward_reward +
                            (1.0 - self.alpha) * backward_reward)
            src_processed = self._maybe_reverse_source(src)
            tgt_hypo = self._maybe_add_eos(tgt_hypo, tgt_dict.eos())
            forward_samples.append({
                "id": dual_sample_id,
                "source": src_processed.cpu(),
                "target": tgt_hypo.cpu(),  # first hypo is best hypo
                "weight": total_reward,
            })

        # Now combine the pseudo-labelled examples into the corresponding batch,
        # with the rewards factored into the weighting of each task's loss
        agg_loss, agg_sample_size, agg_logging_output = 0.0, 0.0, {}
        forward_model.train()
        forward_loss, sample_size, logging_output = self.task.criterion(
            forward_model,
            utils.move_to_cuda(
                WeightedLanguagePairDataset.collate(
                    samples=forward_samples,
                    pad_idx=tgt_dict.pad(),
                    eos_idx=tgt_dict.eos(),
                )),
        )
        agg_loss += forward_loss.detach().item()
        agg_sample_size += sample_size
        agg_logging_output["primal"] = logging_output
        # grad would be further scaled when passed back to trainer,
        # which will do the update
        forward_optimizer.backward(forward_loss)

        backward_model.train()
        backward_loss, sample_size, logging_output = self.task.criterion(
            backward_model, bwd_model_input)

        agg_loss += backward_loss.data.item()
        agg_sample_size += sample_size
        agg_logging_output["dual"] = logging_output
        backward_optimizer.backward(backward_loss)
        return agg_loss, agg_sample_size, agg_logging_output