def main():
    parser = argparse.ArgumentParser(
        description="Rescore generated hypotheses with extra models")
    add_args(parser)
    add_args_rescore(parser)
    args = parser.parse_args()

    assert args.translation_info_export_path is not None, \
        "--translation_info_export_path is required for rescoring"
    assert args.l2r_model_path is not None, "Rescoring needs forward model"

    _, _, forward_task = utils.load_diverse_ensemble_for_inference(
        [args.l2r_model_path])
    rescorer = Rescorer(args, forward_task)
    dst_dict = forward_task.tgt_dict
    # both scorers use the BleuConfig-style constructor for consistency
    base_bleu_scorer = bleu.Scorer(
        bleu.BleuConfig(
            pad=dst_dict.pad(),
            eos=dst_dict.eos(),
            unk=dst_dict.unk(),
        ))
    rescoring_bleu_scorer = bleu.Scorer(
        bleu.BleuConfig(
            pad=dst_dict.pad(),
            eos=dst_dict.eos(),
            unk=dst_dict.unk(),
        ))

    with open(args.translation_info_export_path, "rb") as file:
        translation_info_list = pickle.load(file)

    scores_to_export_list = []
    trans_batch_info = []
    for k in tqdm(range(0, len(translation_info_list), args.batch_size)):
        trans_batch_info = translation_info_list[k:k + args.batch_size]
        for j in range(len(trans_batch_info)):
            trans_batch_info[j]["hypos"] = [
                {"score": hypo["score"], "tokens": hypo["tokens"].cuda()}
                for hypo in trans_batch_info[j]["hypos"]
            ]
        top_tokens, scores_to_export = find_top_tokens(
            args, trans_batch_info, rescorer, dst_dict.pad())
        if args.scores_info_export_path is not None:
            scores_to_export_list += scores_to_export

        for i, trans_info in enumerate(trans_batch_info):
            base_bleu_scorer.add(
                trans_info["target_tokens"].int().cpu(),
                trans_info["hypos"][0]["tokens"].int().cpu(),
            )
            rescoring_bleu_scorer.add(
                trans_info["target_tokens"].int().cpu(),
                top_tokens[i].int().cpu())
        trans_batch_info = []

    print("| Base ", base_bleu_scorer.result_string())
    print("| Rescoring ", rescoring_bleu_scorer.result_string())

    if args.scores_info_export_path is not None:
        with open(args.scores_info_export_path, "wb") as file:
            pickle.dump(scores_to_export_list, file)
def calculate_metric(self):
    total_exact_match = 0
    total_f1 = 0.0
    num_samples = len(self.all_targets)
    trg_vocab = self.tensorizers["trg_seq_tokens"].vocab
    bleu_scorer = bleu.Scorer(
        bleu.BleuConfig(
            pad=trg_vocab.get_pad_index(),
            eos=trg_vocab.get_eos_index(),
            unk=trg_vocab.get_unk_index(),
        ))
    for beam_preds, target in zip(self.all_preds, self.all_targets):
        pred = beam_preds[0]
        if self._compare_target_prediction_tokens(pred, target):
            total_exact_match += 1
        total_f1 += compute_f1(pred, target)
        # BLEU calculation must be done with tensors on CPU, or the
        # type checks in fairseq/bleu.py:add() will fail
        bleu_scorer.add(
            torch.IntTensor(target).cpu(), torch.IntTensor(pred).cpu())
    loss = self.calculate_loss()
    exact_match = round(
        safe_division(total_exact_match, num_samples) * 100.0, 2)
    f1 = round(safe_division(total_f1, num_samples) * 100.0, 2)
    bleu_score = round(
        0.0 if len(self.all_preds) == 0 else bleu_scorer.score(), 2)
    return Seq2SeqMetrics(loss, exact_match, f1, bleu_score)
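# compute_f1, used above, is not shown in this section. A minimal sketch of
# the usual token-overlap definition (an assumption here -- the SQuAD-style
# multiset F1 -- not necessarily PyText's exact implementation):
from collections import Counter

def token_f1(pred_tokens, target_tokens):
    """F1 over the multiset token overlap between prediction and target."""
    common = Counter(pred_tokens) & Counter(target_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(target_tokens)
    return 2 * precision * recall / (precision + recall)

# e.g. token_f1([1, 2, 3], [1, 2, 4]) == 2/3 (two of three tokens match)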
def __init__(self, tgt_dict, bpe_symbol='@@ '):
    self.tgt_dict = tgt_dict
    self.bpe_symbol = bpe_symbol
    self.scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk())
    # use a fresh Dictionary for scoring, so that we can add new elements
    self.scoring_dict = Dictionary()
def smoothed_sentence_bleu(task, target_tokens, hypo_tokens):
    """
    Implements the "Smoothing 3" method from Chen and Cherry, "A Systematic
    Comparison of Smoothing Techniques for Sentence-Level BLEU".
    http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf
    """
    dst_dict = task.target_dictionary
    scorer = bleu.Scorer(dst_dict.pad(), dst_dict.eos(), dst_dict.unk())
    scorer.add(target_tokens, hypo_tokens)

    invcnt = 1
    ratios = []
    for match, count in [
        (scorer.stat.match1, scorer.stat.count1),
        (scorer.stat.match2, scorer.stat.count2),
        (scorer.stat.match3, scorer.stat.count3),
        (scorer.stat.match4, scorer.stat.count4),
    ]:
        if count == 0:
            # disregard n-grams for values of n larger than hypothesis length
            continue
        if match == 0:
            invcnt *= 2
            match = 1.0 / invcnt
        ratios.append(match / count)

    brevity_penalty = np.min(
        [1, np.exp(1 - (scorer.stat.reflen / scorer.stat.predlen))])
    geometric_mean = np.exp(np.log(ratios).mean())
    smoothed_bleu = brevity_penalty * geometric_mean
    return smoothed_bleu
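# A self-contained sketch of the same "Smoothing 3" computation on raw n-gram
# statistics, useful for sanity-checking the function above without a fairseq
# task or Scorer. The (match, count) pairs in the example are invented
# illustrative values, not outputs of any real model.
import numpy as np

def smoothing3_from_stats(matches, counts, ref_len, hyp_len):
    """matches[n-1]/counts[n-1] are clipped n-gram matches/totals, n=1..4."""
    invcnt = 1
    ratios = []
    for match, count in zip(matches, counts):
        if count == 0:
            continue  # hypothesis shorter than n tokens
        if match == 0:
            # successive zero matches are replaced by 1/2, 1/4, 1/8, ...
            invcnt *= 2
            match = 1.0 / invcnt
        ratios.append(match / count)
    brevity_penalty = min(1.0, np.exp(1 - ref_len / hyp_len))
    return brevity_penalty * np.exp(np.log(ratios).mean())

# Example: 5-token hypothesis against a 6-token reference, no 4-gram match.
assert 0.0 < smoothing3_from_stats([4, 2, 1, 0], [5, 4, 3, 2], 6, 5) < 1.0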
def score(fdsys):
    with open(args.ref) as fdref:
        scorer = bleu.Scorer(dict.pad(), dict.eos(), dict.unk())
        for sys_tok, ref_tok in zip(readlines(fdsys), readlines(fdref)):
            sys_tok = dict.encode_line(sys_tok)
            ref_tok = dict.encode_line(ref_tok)
            scorer.add(ref_tok, sys_tok)
        print(scorer.result_string(args.order))
def __init__(self, args, src_dict, dst_dict):
    super().__init__(args, src_dict, dst_dict)
    self.translator = None
    self.scorer = bleu.Scorer(
        bleu.BleuConfig(
            pad=dst_dict.pad(),
            eos=dst_dict.eos(),
            unk=dst_dict.unk(),
        )
    )
def score(fdsys):
    with open(args.ref) as fdref:
        scorer = bleu.Scorer(dict.pad(), dict.eos(), dict.unk())
        for i, (sys_tok, ref_tok) in enumerate(
                zip(readlines(fdsys), readlines(fdref))):
            # resetting with one_init=True yields per-sentence (not
            # corpus-level) BLEU for each line
            scorer.reset(one_init=True)
            sys_tok = dict.encode_line(sys_tok)
            ref_tok = dict.encode_line(ref_tok)
            scorer.add(ref_tok, sys_tok)
            print(i, scorer.result_string(args.order))
def build_scorer(choice, tgt_dict):
    _choice = choice._name if isinstance(choice, DictConfig) else choice

    if _choice == "bleu":
        from fairseq.scoring import bleu

        return bleu.Scorer(
            bleu.BleuConfig(
                pad=tgt_dict.pad(), eos=tgt_dict.eos(), unk=tgt_dict.unk())
        )
    return _build_scorer(choice)
def build_scorer(args, tgt_dict):
    from fairseq import utils

    if args.sacrebleu:
        utils.deprecation_warning(
            "--sacrebleu is deprecated. Please use --scoring sacrebleu instead."
        )
        args.scoring = "sacrebleu"
    if args.scoring == "bleu":
        from fairseq.scoring import bleu

        return bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk())
    return _build_scorer(args)
def evaluate_weights(scores_info, feature_weights, length_penalty):
    scorer = bleu.Scorer(vocab_constants.PAD_ID, vocab_constants.EOS_ID,
                         vocab_constants.UNK_ID)
    for example in scores_info:
        # combine the per-hypothesis feature scores linearly, then normalize
        # by the length penalty before picking the top hypothesis
        weighted_scores = (example["scores"] * feature_weights).sum(axis=1)
        weighted_scores /= (example["tgt_len"] ** length_penalty) + 1e-12
        top_hypo_ind = np.argmax(weighted_scores)
        top_hypo = example["hypos"][top_hypo_ind]
        ref = example["target_tokens"]
        scorer.add(torch.IntTensor(ref), torch.IntTensor(top_hypo))
    return scorer.score()
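# A minimal numpy-only sketch of the selection rule above (no BLEU scoring):
# each hypothesis gets a linear combination of its feature scores, normalized
# by a length penalty, and the argmax is selected. All values below are
# invented illustrative numbers.
import numpy as np

scores = np.array([[-1.2, -0.8],   # hypothesis 0: [model score, LM score]
                   [-1.0, -1.5],   # hypothesis 1
                   [-0.9, -1.1]])  # hypothesis 2
tgt_len = np.array([5, 7, 6])
feature_weights = np.array([1.0, 0.5])
length_penalty = 1.0

weighted = (scores * feature_weights).sum(axis=1)
weighted /= (tgt_len ** length_penalty) + 1e-12
print("selected hypothesis:", np.argmax(weighted))  # -> 2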
def compute_many(guess: torch.Tensor, answers: torch.Tensor, pad_idx, end_idx,
                 unk_idx):
    """
    Return BLEU-1..4 using fairseq and tokens.
    """
    if fairseqbleu is None:
        return None
    scorer = fairseqbleu.Scorer(pad_idx, end_idx, unk_idx)
    answers = answers.cpu().int()
    guess = guess.cpu().int()
    scorer.add(answers, guess)
    return [FairseqBleuMetric(scorer.score(i) / 100.0) for i in range(1, 5)]
def calculate_metric(self):
    num_correct = 0
    total_count = len(self.all_targets)
    trg_vocab = self.tensorizers["trg_seq_tokens"].vocab
    bleu_scorer = bleu.Scorer(
        trg_vocab.get_pad_index(),
        trg_vocab.get_eos_index(),
        trg_vocab.get_unk_index(),
    )
    for beam_pred, target in zip(self.all_preds, self.all_targets):
        pred = beam_pred[0]
        if self._compare_target_prediction_tokens(pred, target):
            num_correct = num_correct + 1
        # BLEU calculation must be done with tensors on CPU, or the
        # type checks in fairseq/bleu.py:add() will fail
        bleu_scorer.add(
            torch.IntTensor(target).cpu(), torch.IntTensor(pred).cpu())
    bleu_score = 0.0 if len(self.all_preds) == 0 else bleu_scorer.score()
    accuracy = safe_division(num_correct, total_count)
    cross_entropy_loss = self.calculate_loss()
    return Seq2SeqMetrics(accuracy, cross_entropy_loss, bleu_score)
def main(args):
    assert args.path is not None, '--path required for generation!'
    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert args.replace_unk is None or args.raw_text, \
        '--replace-unk requires a raw text dataset (--raw-text)'

    import_user_module(args)

    if args.max_tokens is None and args.max_sentences is None:
        args.max_tokens = 12000
    print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # ========== for bartsv task, rebuild dictionary after model args are loaded ==========
    # assert not hasattr(args, 'node_freq_min'), 'node_freq_min should be read from model args'
    # args.node_freq_min = 5    # temporarily set before model loading, as this is needed in tasks.setup_task(args)
    # =====================================================================================

    # Load dataset splits
    task = tasks.setup_task(args)
    # Note: states are not needed since they will be provided by the state
    # machine
    task.load_dataset(args.gen_subset, state_machine=False)

    # Set dictionaries
    try:
        src_dict = getattr(task, 'source_dictionary', None)
    except NotImplementedError:
        src_dict = None
    tgt_dict = task.target_dictionary

    # Load ensemble
    print('| loading model(s) from {}'.format(args.path))
    try:
        models, _model_args = checkpoint_utils.load_model_ensemble(
            args.path.split(':'),
            arg_overrides=eval(args.model_overrides),
            task=task,
        )
    except Exception:
        # NOTE this is for "bartsv" models when the default "args.node_freq_min"
        # (5) differs from the model's: building the model with the above task
        # fails because the target vocabulary sizes differ
        # TODO better handle these cases (without sacrificing compatibility
        # with other model archs)
        models, _model_args = checkpoint_utils.load_model_ensemble(
            args.path.split(':'),
            arg_overrides=eval(args.model_overrides),
            task=None,
        )

    # ========== for bartsv task, rebuild the dictionary based on model args ==========
    if 'bartsv' in _model_args.arch and args.node_freq_min != _model_args.node_freq_min:
        args.node_freq_min = _model_args.node_freq_min
        # Load dataset splits
        task = tasks.setup_task(args)
        # Note: states are not needed since they will be provided by the state
        # machine
        task.load_dataset(args.gen_subset, state_machine=False)

        # Set dictionaries
        try:
            src_dict = getattr(task, 'source_dictionary', None)
        except NotImplementedError:
            src_dict = None
        tgt_dict = task.target_dictionary
    # ==================================================================================

    # ========== for previous models trained before the new arguments existed ==========
    if not hasattr(_model_args, 'shift_pointer_value'):
        _model_args.shift_pointer_value = 1
    # ==================================================================================

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align
    # dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=None,
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=args.required_batch_size_multiple,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
        # large_sent_first=False  # not in fairseq
    ).next_epoch_itr(shuffle=False)

    # Initialize generator
    gen_timer = StopwatchMeter()
    generator = task.build_generator(args, _model_args)

    # Generate and compute BLEU score
    if args.sacrebleu:
        scorer = bleu.SacrebleuScorer()
    else:
        scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk())
    num_sentences = 0
    has_target = True
    examples = Examples(args.path, args.results_path, args.gen_subset,
                        args.nbest)

    error_stats = {'num_sub_start': 0}

    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        for sample in t:
            sample = utils.move_to_cuda(sample) if use_cuda else sample
            if 'net_input' not in sample:
                raise Exception("Did not expect empty sample")

            prefix_tokens = None
            if args.prefix_size > 0:
                prefix_tokens = sample['target'][:, :args.prefix_size]

            gen_timer.start()
            hypos = task.inference_step(generator, models, sample, args,
                                        prefix_tokens)
            num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos)
            gen_timer.stop(num_generated_tokens)

            for i, sample_id in enumerate(sample['id'].tolist()):
                has_target = sample['target'] is not None

                # Remove padding
                src_tokens = utils.strip_pad(
                    sample['net_input']['src_tokens'][i, :], tgt_dict.pad())
                target_tokens = None
                if has_target:
                    target_tokens = utils.strip_pad(
                        sample['target'][i, :], tgt_dict.pad()).int().cpu()

                # Either retrieve the original sentences or regenerate them
                # from tokens.
                if align_dict is not None:
                    src_str = task.dataset(
                        args.gen_subset).src.get_original_text(sample_id)
                    target_str = task.dataset(
                        args.gen_subset).tgt.get_original_text(sample_id)
                else:
                    if src_dict is not None:
                        src_str = src_dict.string(src_tokens, args.remove_bpe)
                    else:
                        src_str = ""
                    if has_target:
                        target_str = tgt_dict.string(target_tokens,
                                                     args.remove_bpe,
                                                     escape_unk=True)
                        # NOTE we do not really have the ground truth target
                        # (with the same alignments); target_str might contain
                        # <unk> since the target dictionary is built only on
                        # training data, but it doesn't matter -- it should
                        # not affect the target dictionary itself

                if not args.quiet:
                    if src_dict is not None:
                        print('S-{}\t{}'.format(sample_id, src_str))
                    if has_target:
                        print('T-{}\t{}'.format(sample_id, target_str))

                # Process top predictions
                for j, hypo in enumerate(hypos[i][:args.nbest]):
                    if 'bartsv' in _model_args.arch:
                        if not tgt_dict[hypo['tokens'][0]].startswith(
                                tgt_dict.bpe.INIT):
                            error_stats['num_sub_start'] += 1
                        actions_nopos, actions_pos, actions = \
                            post_process_action_pointer_prediction_bartsv(
                                hypo, tgt_dict)
                    else:
                        actions_nopos, actions_pos, actions = \
                            post_process_action_pointer_prediction(
                                hypo, tgt_dict)

                    if args.clean_arcs:
                        actions_nopos, actions_pos, actions, invalid_idx = \
                            clean_pointer_arcs(actions_nopos, actions_pos,
                                               actions)

                    # TODO these are just dummies for the reference below to run
                    hypo_tokens = hypo['tokens'].int().cpu()
                    hypo_str = '\t'.join(actions)
                    alignment = None

                    # update the list of examples
                    examples.append({
                        'actions_nopos': actions_nopos,
                        'actions_pos': actions_pos,
                        'actions': actions,
                        'reference': target_str,
                        'src_str': src_str,
                        'sample_id': sample_id
                    })

                    if not args.quiet:
                        print('H-{}\t{}\t{}'.format(sample_id, hypo_str,
                                                    hypo['score']))
                        print('P-{}\t{}'.format(
                            sample_id,
                            ' '.join(
                                map(
                                    lambda x: '{:.4f}'.format(x),
                                    hypo['positional_scores'].tolist(),
                                ))))
                        if args.print_alignment:
                            print('A-{}\t{}'.format(
                                sample_id,
                                ' '.join(
                                    map(lambda x: str(utils.item(x)),
                                        alignment))))

                    # Score only the top hypothesis
                    if has_target and j == 0:
                        if align_dict is not None or args.remove_bpe is not None:
                            # Convert back to tokens for evaluation with unk
                            # replacement and/or without BPE
                            target_tokens = tgt_dict.encode_line(
                                target_str, add_if_not_exist=False)
                            # NOTE do not modify the tgt dictionary with
                            # 'add_if_not_exist=True'!
                        if hasattr(scorer, 'add_string'):
                            scorer.add_string(target_str, hypo_str)
                        else:
                            scorer.add(target_tokens, hypo_tokens)

            wps_meter.update(num_generated_tokens)
            t.log({'wps': round(wps_meter.avg)})
            num_sentences += sample['nsentences']

    # Save examples to files
    examples.save()

    print('| Error case (handled by manual fix) statistics:')
    print(error_stats)

    print('| Translated {} sentences ({} tokens) in {:.1f}s '
          '({:.2f} sentences/s, {:.2f} tokens/s)'.format(
              num_sentences, gen_timer.n, gen_timer.sum,
              num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if has_target:
        print('| Generate {} with beam={}: {}'.format(
            args.gen_subset, args.beam, scorer.result_string()))

    return scorer
def __init__(self, args, src_dict, dst_dict):
    super().__init__(args, src_dict, dst_dict)
    self.translator = None
    self.scorer = bleu.Scorer(dst_dict.pad(), dst_dict.eos(), dst_dict.unk())
def score_target_hypo(args, a, b, c, lenpen, target_outfile, hypo_outfile,
                      write_hypos, normalize):
    print("lenpen", lenpen, "weight1", a, "weight2", b, "weight3", c)
    gen_output_lst, bitext1_lst, bitext2_lst, lm_res_lst = load_score_files(args)
    dict = dictionary.Dictionary()
    scorer = bleu.Scorer(
        bleu.BleuConfig(
            pad=dict.pad(),
            eos=dict.eos(),
            unk=dict.unk(),
        ))

    ordered_hypos = {}
    ordered_targets = {}

    for shard_id in range(len(bitext1_lst)):
        bitext1 = bitext1_lst[shard_id]
        bitext2 = bitext2_lst[shard_id]
        gen_output = gen_output_lst[shard_id]
        lm_res = lm_res_lst[shard_id]

        total = len(bitext1.rescore_source.keys())
        source_lst = []
        hypo_lst = []
        score_lst = []
        reference_lst = []
        j = 1
        best_score = -math.inf

        for i in range(total):
            # length is measured in terms of words, not bpe tokens, since
            # models may not share the same bpe
            target_len = len(bitext1.rescore_hypo[i].split())

            if lm_res is not None:
                lm_score = lm_res.score[i]
            else:
                lm_score = 0

            if bitext2 is not None:
                bitext2_score = bitext2.rescore_score[i]
                bitext2_backwards = bitext2.backwards
            else:
                bitext2_score = None
                bitext2_backwards = None

            score = rerank_utils.get_score(
                a,
                b,
                c,
                target_len,
                bitext1.rescore_score[i],
                bitext2_score,
                lm_score=lm_score,
                lenpen=lenpen,
                src_len=bitext1.source_lengths[i],
                tgt_len=bitext1.target_lengths[i],
                bitext1_backwards=bitext1.backwards,
                bitext2_backwards=bitext2_backwards,
                normalize=normalize,
            )

            if score > best_score:
                best_score = score
                best_hypo = bitext1.rescore_hypo[i]

            if j == gen_output.num_hypos[i] or j == args.num_rescore:
                j = 1
                hypo_lst.append(best_hypo)
                score_lst.append(best_score)
                source_lst.append(bitext1.rescore_source[i])
                reference_lst.append(bitext1.rescore_target[i])

                best_score = -math.inf
                best_hypo = ""
            else:
                j += 1

        gen_keys = list(sorted(gen_output.no_bpe_target.keys()))

        for key in range(len(gen_keys)):
            if args.prefix_len is None:
                assert hypo_lst[key] in gen_output.no_bpe_hypo[gen_keys[key]], (
                    "pred and rescore hypo mismatch: i: " + str(key) + ", "
                    + str(hypo_lst[key]) + str(gen_keys[key])
                    + str(gen_output.no_bpe_hypo[key]))
                sys_tok = dict.encode_line(hypo_lst[key])
                ref_tok = dict.encode_line(
                    gen_output.no_bpe_target[gen_keys[key]])
                scorer.add(ref_tok, sys_tok)
            else:
                full_hypo = rerank_utils.get_full_from_prefix(
                    hypo_lst[key], gen_output.no_bpe_hypo[gen_keys[key]])
                sys_tok = dict.encode_line(full_hypo)
                ref_tok = dict.encode_line(
                    gen_output.no_bpe_target[gen_keys[key]])
                scorer.add(ref_tok, sys_tok)

        # if only one set of hyperparameters is provided, write the
        # predictions to a file
        if write_hypos:
            # recover the original ids from n-best list generation
            for key in range(len(gen_output.no_bpe_target)):
                if args.prefix_len is None:
                    assert hypo_lst[key] in gen_output.no_bpe_hypo[gen_keys[key]], (
                        "pred and rescore hypo mismatch:" + "i:" + str(key)
                        + str(hypo_lst[key])
                        + str(gen_output.no_bpe_hypo[key]))
                    ordered_hypos[gen_keys[key]] = hypo_lst[key]
                    ordered_targets[gen_keys[key]] = gen_output.no_bpe_target[
                        gen_keys[key]]
                else:
                    full_hypo = rerank_utils.get_full_from_prefix(
                        hypo_lst[key], gen_output.no_bpe_hypo[gen_keys[key]])
                    ordered_hypos[gen_keys[key]] = full_hypo
                    ordered_targets[gen_keys[key]] = gen_output.no_bpe_target[
                        gen_keys[key]]

    # write the hypos in the original order from n-best list generation
    if args.num_shards == len(bitext1_lst):
        with open(target_outfile, "w") as t:
            with open(hypo_outfile, "w") as h:
                for key in range(len(ordered_hypos)):
                    t.write(ordered_targets[key])
                    h.write(ordered_hypos[key])

    res = scorer.result_string(4)
    if write_hypos:
        print(res)
    score = rerank_utils.parse_bleu_scoring(res)
    return score
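# rerank_utils.get_score is not shown in this section. A minimal sketch of the
# usual log-linear combination such a reranker implements (an assumed form for
# illustration, not fairseq's exact noisy-channel code): weight the forward,
# backward, and LM log-probabilities, then normalize by a length penalty.
def rerank_score_sketch(a, b, c, fwd_score, bwd_score, lm_score,
                        tgt_len, lenpen):
    """Combine log-probabilities from three models for one hypothesis."""
    combined = a * fwd_score + b * bwd_score + c * lm_score
    return combined / (tgt_len ** lenpen)

# e.g. with equal weights and lenpen=1.0, for a 10-token hypothesis:
print(rerank_score_sketch(1.0, 1.0, 1.0, -12.3, -14.1, -20.5, 10, 1.0))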
def _generate_score(models, args, task, dataset, modify_target_dict):
    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load ensemble
    if not args.quiet:
        print(
            "| loading model(s) from {}".format(
                ", ".join(args.path.split(CHECKPOINT_PATHS_DELIMITER))
            )
        )

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=True,
        )

    translator = build_sequence_generator(args, task, models)
    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    print("seed number is " + str(args.max_examples_to_evaluate_seed))
    if args.max_examples_to_evaluate > 0:
        pytorch_translate_data.subsample_pair_dataset(
            dataset, args.max_examples_to_evaluate,
            args.max_examples_to_evaluate_seed
        )

    # Keep track of translations: initialize with empty translations
    # and zero-probability scores
    translated_sentences = [""] * len(dataset)
    translated_scores = [0.0] * len(dataset)
    hypos_list = []

    collect_output_hypos = getattr(args, "output_hypos_binary_path", False)
    if collect_output_hypos:
        output_hypos_token_arrays = [None] * len(dataset)

    # Generate and compute BLEU score
    dst_dict = task.target_dictionary
    if args.sacrebleu:
        scorer = bleu.SacrebleuScorer()
    else:
        scorer = bleu.Scorer(dst_dict.pad(), dst_dict.eos(), dst_dict.unk())

    itr = task.get_batch_iterator(
        dataset=dataset,
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(), *[model.max_positions() for model in models]
        ),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=8,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)

    oracle_scorer = None
    if args.report_oracle_bleu:
        oracle_scorer = bleu.Scorer(dst_dict.pad(), dst_dict.eos(),
                                    dst_dict.unk())

    rescorer = None
    num_sentences = 0
    translation_samples = []
    translation_info_list = []
    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        gen_timer = StopwatchMeter()
        translations = translator.generate_batched_itr(
            t,
            maxlen_a=args.max_len_a,
            maxlen_b=args.max_len_b,
            cuda=use_cuda,
            timer=gen_timer,
            prefix_size=1
            if pytorch_translate_data.is_multilingual_many_to_one(args) else 0,
        )

        for trans_info in _iter_translations(
            args, task, dataset, translations, align_dict, rescorer,
            modify_target_dict
        ):
            if hasattr(scorer, "add_string"):
                scorer.add_string(trans_info.target_str, trans_info.hypo_str)
            else:
                scorer.add(trans_info.target_tokens, trans_info.hypo_tokens)
            if oracle_scorer is not None:
                oracle_scorer.add(trans_info.target_tokens,
                                  trans_info.best_hypo_tokens)

            if getattr(args, "translation_output_file", False):
                translated_sentences[trans_info.sample_id] = trans_info.hypo_str
            if getattr(args, "translation_probs_file", False):
                translated_scores[trans_info.sample_id] = trans_info.hypo_score
            if getattr(args, "hypotheses_export_path", False):
                hypos_list.append(trans_info.hypos)
            if collect_output_hypos:
                output_hypos_token_arrays[trans_info.sample_id] = (
                    trans_info.best_hypo_tokens)
            if args.translation_info_export_path is not None:
                # Strip expensive data from hypotheses before saving
                hypos = [
                    {k: v for k, v in hypo.items() if k in ["tokens", "score"]}
                    for hypo in trans_info.hypos
                ]
                # Make sure everything is on cpu before exporting
                hypos = [
                    {"score": hypo["score"], "tokens": hypo["tokens"].cpu()}
                    for hypo in hypos
                ]
                translation_info_list.append(
                    {
                        "src_tokens": trans_info.src_tokens.cpu(),
                        "target_tokens": trans_info.target_tokens,
                        "hypos": hypos,
                    }
                )
            translation_samples.append(
                collections.OrderedDict(
                    {
                        "sample_id": trans_info.sample_id.item(),
                        "src_str": trans_info.src_str,
                        "target_str": trans_info.target_str,
                        "hypo_str": trans_info.hypo_str,
                    }
                )
            )
            wps_meter.update(trans_info.src_tokens.size(0))
            t.log({"wps": round(wps_meter.avg)})
            num_sentences += 1

    # If applicable, save collected hypothesis tokens to binary output file
    if collect_output_hypos:
        output_dataset = pytorch_translate_data.InMemoryIndexedDataset()
        output_dataset.load_from_sequences(output_hypos_token_arrays)
        output_dataset.save(args.output_hypos_binary_path)
    if args.output_source_binary_path:
        dataset.src.save(args.output_source_binary_path)
    if args.translation_info_export_path is not None:
        with open(args.translation_info_export_path, "wb") as f:
            pickle.dump(translation_info_list, f)

    # If applicable, save the translations and scores to the output files.
    # These two outputs are used in dual learning for weighted backtranslation.
    if getattr(args, "translation_output_file", False) and getattr(
        args, "translation_probs_file", False
    ):
        with open(args.translation_output_file, "w") as translation_file, open(
            args.translation_probs_file, "w"
        ) as score_file:
            for hypo_str, hypo_score in zip(translated_sentences,
                                            translated_scores):
                if len(hypo_str.strip()) > 0:
                    print(hypo_str, file=translation_file)
                    print(np.exp(hypo_score), file=score_file)

    # e.g. for external evaluation
    if getattr(args, "hypotheses_export_path", False):
        with open(args.hypotheses_export_path, "w") as out_file:
            for hypos in hypos_list:
                for hypo in hypos:
                    print(
                        task.tgt_dict.string(
                            hypo["tokens"], bpe_symbol=args.remove_bpe
                        ),
                        file=out_file,
                    )

    if oracle_scorer is not None:
        print(f"| Oracle BLEU (best hypo in beam): {oracle_scorer.result_string()}")

    return scorer, num_sentences, gen_timer, translation_samples
def random_search(scores_info_export_path, num_trials, report_oracle_bleu=False):
    with open(scores_info_export_path, "rb") as f:
        scores_info = pickle.load(f)

    dummy_task = DummyTask()

    if report_oracle_bleu:
        oracle_scorer = bleu.Scorer(vocab_constants.PAD_ID,
                                    vocab_constants.EOS_ID,
                                    vocab_constants.UNK_ID)

        for example in scores_info:
            smoothed_bleu = []
            for hypo in example["hypos"]:
                eval_score = smoothed_sentence_bleu(
                    dummy_task,
                    torch.IntTensor(example["target_tokens"]),
                    torch.IntTensor(hypo),
                )
                smoothed_bleu.append(eval_score)
            best_hypo_ind = np.argmax(smoothed_bleu)
            example["best_hypo_ind"] = best_hypo_ind

            oracle_scorer.add(
                torch.IntTensor(example["target_tokens"]),
                torch.IntTensor(example["hypos"][best_hypo_ind]),
            )

        print("oracle BLEU: ", oracle_scorer.score())

    num_features = scores_info[0]["scores"].shape[1]
    assert all(
        example["scores"].shape[1] == num_features for example in scores_info
    ), "All examples must have the same number of scores!"
    feature_weights = np.zeros(num_features)
    feature_weights[0] = 1
    score = evaluate_weights(scores_info, feature_weights, length_penalty=1)
    print("base BLEU: ", score)
    best_score = score
    best_weights = feature_weights
    # match the penalty used for the base evaluation above
    best_length_penalty = 1

    nonzero_features = identify_nonzero_features(scores_info)

    for i in range(num_trials):
        feature_weights = np.zeros(num_features)
        random_weights = np.random.dirichlet(np.ones(nonzero_features.size))
        feature_weights[nonzero_features] = random_weights
        length_penalty = 1.5 * np.random.random()

        score = evaluate_weights(scores_info, feature_weights, length_penalty)
        if score > best_score:
            best_score = score
            best_weights = feature_weights
            best_length_penalty = length_penalty
        print(f"\r[{i}] best: {best_score}", end="", flush=True)

    print()
    print("best weights: ", best_weights)
    print("best length penalty: ", best_length_penalty)

    return best_weights, best_length_penalty, best_score
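# A self-contained toy run of the same random-search loop: sample weight
# vectors from a Dirichlet (so they lie on the simplex), score each with a
# stand-in objective, and keep the best. The quadratic objective below is
# invented purely for illustration; the real code scores corpus BLEU via
# evaluate_weights above.
import numpy as np

rng = np.random.default_rng(0)
target = np.array([0.7, 0.2, 0.1])     # pretend-optimal weights

def toy_objective(w):
    return -np.sum((w - target) ** 2)  # higher is better

best_score, best_w = -np.inf, None
for _ in range(200):
    w = rng.dirichlet(np.ones(3))      # random point on the 3-simplex
    s = toy_objective(w)
    if s > best_score:
        best_score, best_w = s, w
print("best weights:", best_w, "score:", best_score)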
def forward(
    self,
    sample,
    forward_model,
    forward_optimizer,
    tgt_dict,
    backward_model,
    backward_optimizer,
    src_dict,
    lm_scorer=None,
    reduce=True,
    **generate_kwargs,
):
    """Compute the reconstruction and LM loss from forward and backward models.

    Args:
        sample: original input.
        hypos: pseudo labels generated by the forward model. They are used
            as an approximation of the target space to do importance sampling.
        forward_model: the model used to generate pseudo labels.
        backward_model: the model to reconstruct the original input using
            pseudo labels.
        lm_scorer: an LM model in eval mode to score pseudo labels in the
            target space.
    """
    # Generate translations
    nbest_translations = self._generate_translation(
        forward_model, tgt_dict, sample, self.args.beam, **generate_kwargs)

    forward_samples = []
    backward_samples = {}
    # TODO (T36875783): load pretrained lm to score
    lm_score = 0.0
    for sample_id, src_processed, tgt_hypos in nbest_translations:
        # compute each model's reward
        forward_reward = lm_score

        # construct the sample; compute the ce loss
        # backward_samples need to handle EOS
        src = self._maybe_reverse_source(src_processed)
        src = self._maybe_add_eos(src, src_dict.eos())
        assert len(tgt_hypos) == self.args.beam
        for tgt_hypo_i, tgt_hypo_struct in enumerate(tgt_hypos):
            dual_sample_id = sample_id.item() * self.args.beam + tgt_hypo_i
            tgt_hypo = tgt_hypo_struct["tokens"]

            # add EOS to the target, i.e. the original source, since it'll be
            # used as target; removing EOS from the src is optional
            if self.remove_eos_at_src:
                tgt_hypo = tgt_hypo[:-1]
            tgt_hypo_processed = self._maybe_reverse_source(tgt_hypo)

            backward_sample = {
                "id": dual_sample_id,
                "source": tgt_hypo_processed.cpu(),
                "target": src.cpu(),
                "weight": 1.0 - self.alpha,
            }
            assert dual_sample_id not in backward_samples
            backward_samples[dual_sample_id] = backward_sample

    bwd_model_input = utils.move_to_cuda(
        WeightedLanguagePairDataset.collate(
            samples=list(backward_samples.values()),
            pad_idx=src_dict.pad(),
            eos_idx=src_dict.eos(),
        ))
    reconstructed_source = self._generate_translation(
        backward_model, src_dict, bwd_model_input, 1, **generate_kwargs)
    for dual_sample_id, tgt_hypo_processed, src_hypos in reconstructed_source:
        backward_sample = backward_samples[dual_sample_id.item()]
        src = backward_sample["target"]
        tgt_hypo = self._maybe_reverse_source(tgt_hypo_processed)

        # use BLEU score as the reconstruction reward
        scorer = bleu.Scorer(src_dict.pad(), src_dict.eos(), src_dict.unk())
        assert len(src_hypos) == 1
        src_hypo = src_hypos[0]["tokens"][:-1]
        scorer.add(src.int().cpu(), src_hypo.int().cpu())
        backward_reward = (
            scorer.score(order=self.args.reconstruction_bleu_order) / 100.0)

        original_stc = " ".join(src_dict[tid] for tid in src.tolist())
        translated_stc = " ".join(tgt_dict[tid] for tid in tgt_hypo)
        recon_stc = " ".join(src_dict[tid] for tid in src_hypo.tolist())
        if int(dual_sample_id / self.args.beam) % 100 == 0:
            print("--------")
            print(
                "original sentence:",
                original_stc.replace(self.args.source_bpe_end_marker, ""),
            )
            print(
                "translated sentence:",
                translated_stc.replace(self.args.source_bpe_end_marker, ""),
            )
            print(
                "reconstructed sentence:",
                recon_stc.replace(self.args.source_bpe_end_marker, ""),
            )
            print("reward:", backward_reward)
            print("--------")

        total_reward = (self.alpha * forward_reward
                        + (1.0 - self.alpha) * backward_reward)
        src_processed = self._maybe_reverse_source(src)
        tgt_hypo = self._maybe_add_eos(tgt_hypo, tgt_dict.eos())
        forward_samples.append({
            "id": dual_sample_id,
            "source": src_processed.cpu(),
            "target": tgt_hypo.cpu(),  # first hypo is best hypo
            "weight": total_reward,
        })

    # Now combine pseudo-labelled examples into the corresponding batch, with
    # rewards factored into the weighting of each task's loss
    agg_loss, agg_sample_size, agg_logging_output = 0.0, 0.0, {}
    forward_model.train()
    forward_loss, sample_size, logging_output = self.task.criterion(
        forward_model,
        utils.move_to_cuda(
            WeightedLanguagePairDataset.collate(
                samples=forward_samples,
                pad_idx=tgt_dict.pad(),
                eos_idx=tgt_dict.eos(),
            )),
    )
    agg_loss += forward_loss.detach().item()
    agg_sample_size += sample_size
    agg_logging_output["primal"] = logging_output
    # grad would be further scaled when passed back to trainer,
    # which will do the update
    forward_optimizer.backward(forward_loss)

    backward_model.train()
    backward_loss, sample_size, logging_output = self.task.criterion(
        backward_model, bwd_model_input)
    agg_loss += backward_loss.data.item()
    agg_sample_size += sample_size
    agg_logging_output["dual"] = logging_output
    backward_optimizer.backward(backward_loss)

    return agg_loss, agg_sample_size, agg_logging_output
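# A tiny numeric illustration of the reward interpolation used above: each
# pseudo-parallel pair's training weight mixes the (LM-based) forward reward
# with the BLEU-based reconstruction reward via alpha. Values are invented.
alpha = 0.3
forward_reward = 0.0            # placeholder LM reward (see TODO above)
backward_reward = 0.42          # reconstruction BLEU / 100
total_reward = alpha * forward_reward + (1.0 - alpha) * backward_reward
print(total_reward)             # 0.294 -> weight of this forward sample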