def compute_gleu(source, references, prediction_path):
    """get sentence-level gleu scores"""
    sys.stderr.write('Running GLEU...\n')
    gleu_calculator = GLEU(4)
    gleu_calculator.load_sources(source)
    num_iterations = 200
    gleu_calculator.load_references(references)
    return np.array([
        float(g[0]) for g in gleu_calculator.run_iterations(
            num_iterations=num_iterations,
            # num_references=len(references),
            source=source,
            hypothesis=prediction_path,
            per_sent=True)
    ])

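# A minimal usage sketch (not part of the original code): it assumes compute_gleu
# and the GLEU class are importable from this module, and that the file paths
# below are hypothetical, whitespace-tokenized, one-sentence-per-line text files.
def _example_compute_gleu_usage():
    sentence_scores = compute_gleu(
        source='data/test.src',            # hypothetical source file
        references=['data/test.ref0'],     # hypothetical reference file(s)
        prediction_path='out/test.hyp')    # hypothetical system output
    # one sentence-level GLEU score per hypothesis line
    print('mean sentence-level GLEU: {:.4f}'.format(sentence_scores.mean()))
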
def gleu_scores(source, references, systems, ngrams_len=4, num_iterations=500, debug=False):
    # if there is only one reference, just do one iteration
    if len(references) == 1:
        num_iterations = 1

    gleu_calculator = GLEU(ngrams_len)
    if isinstance(source, six.string_types):
        gleu_calculator.load_sources(source)
    else:
        gleu_calculator.set_sources(source)
    if isinstance(references[0], six.string_types):
        gleu_calculator.load_references(references)
    else:
        gleu_calculator.set_references(references)

    total = []
    per_sentence = []
    for hpath in systems:
        if isinstance(hpath, six.string_types):
            with open(hpath) as instream:
                hyp = [line.split() for line in instream]
            if not debug:
                # print the system name without a newline so the overall
                # score follows on the same line
                print(os.path.basename(hpath), end=' ')
        else:
            instream = hpath
            hyp = [line.split() for line in instream]

        # first generate a random list of indices, using a different seed
        # for each iteration
        indices = []
        for j in range(num_iterations):
            random.seed(j * 101)
            indices.append([
                random.randint(0, len(references) - 1)
                for i in range(len(hyp))
            ])

        if debug:
            print()
            print('===== Sentence-level scores =====')
            print('SID Mean Stdev 95%CI GLEU')

        iter_stats = [[0 for i in range(2 * ngrams_len + 2)]
                      for j in range(num_iterations)]

        for i, h in enumerate(hyp):
            gleu_calculator.load_hypothesis_sentence(h)

            # we are going to store the score of this sentence for each ref
            # so we don't have to recalculate them 500 times
            stats_by_ref = [None for r in range(len(references))]

            for j in range(num_iterations):
                ref = indices[j][i]
                this_stats = stats_by_ref[ref]
                if this_stats is None:
                    this_stats = [
                        s for s in gleu_calculator.gleu_stats(i, r_ind=ref)
                    ]
                    stats_by_ref[ref] = this_stats
                iter_stats[j] = [
                    sum(scores)
                    for scores in zip(iter_stats[j], this_stats)
                ]

            per_sentence.append(
                get_gleu_stats([
                    gleu_calculator.gleu(stats, smooth=True)
                    for stats in stats_by_ref
                ]))

            if debug:
                # sentence-level GLEU is the mean GLEU of the hypothesis
                # compared to each reference
                for r in range(len(references)):
                    if stats_by_ref[r] is None:
                        stats_by_ref[r] = [
                            s for s in gleu_calculator.gleu_stats(i, r_ind=r)
                        ]
                # print the sentence id, then its stats on the same line
                print(i, end=' ')
                print(' '.join(per_sentence[-1]))

        total.append(
            get_gleu_stats(
                [gleu_calculator.gleu(stats) for stats in iter_stats]))
        if debug:
            print('\n==== Overall score =====')
            print('Mean Stdev 95%CI GLEU')
            print(' '.join(total[-1]))
        else:
            print("total", total[-1][0])

    return total, per_sentence

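# A minimal usage sketch (not part of the original code): gleu_scores accepts
# either file paths or pre-tokenized sentence lists for the source and
# references; the paths and system names below are hypothetical.
def _example_gleu_scores_usage():
    total, per_sentence = gleu_scores(
        source='data/test.src',
        references=['data/test.ref0', 'data/test.ref1'],
        systems=['out/system_a.hyp'],
        num_iterations=500)
    # each entry holds formatted strings: mean, stdev and a 95% confidence interval
    print('corpus-level GLEU stats:', ' '.join(total[0]))
    print('first-sentence GLEU stats:', ' '.join(per_sentence[0]))
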
def main(args):
    assert args.path is not None, '--path required for generation!'
    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert args.replace_unk is None or args.raw_text, \
        '--replace-unk requires a raw text dataset (--raw-text)'

    if args.max_tokens is None and args.max_sentences is None:
        args.max_tokens = 12000
    print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args.gen_subset)
    print('| {} {} {} examples'.format(args.data, args.gen_subset,
                                       len(task.dataset(args.gen_subset))))

    # Set dictionaries
    src_dict = task.source_dictionary
    tgt_dict = task.target_dictionary

    # Load ensemble
    print('| loading model(s) from {}'.format(args.path))
    models, _ = utils.load_ensemble_for_inference(
        args.path.split(':'), task, model_arg_overrides=eval(args.model_overrides))

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[model.max_positions() for model in models]),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=8,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
    ).next_epoch_itr(shuffle=False)

    # Initialize generator
    gen_timer = StopwatchMeter()
    if args.score_reference:
        translator = SequenceScorer(models, task.target_dictionary)
    else:
        translator = SequenceGenerator(
            models,
            task.target_dictionary,
            beam_size=args.beam,
            minlen=args.min_len,
            stop_early=(not args.no_early_stop),
            normalize_scores=(not args.unnormalized),
            len_penalty=args.lenpen,
            unk_penalty=args.unkpen,
            sampling=args.sampling,
            sampling_topk=args.sampling_topk,
            sampling_temperature=args.sampling_temperature,
            diverse_beam_groups=args.diverse_beam_groups,
            diverse_beam_strength=args.diverse_beam_strength,
        )

    if use_cuda:
        translator.cuda()

    # Initialize fluency scorer (and language model)
    fluency_scorer = FluencyScorer(args.lang_model_path, args.lang_model_data)

    # Generate and compute BLEU score
    scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk())

    # Save all sources, targets and hypotheses to compute the GLEU score
    sources = []
    targets = []
    hypoths = []

    num_sentences = 0
    has_target = True
    with progress_bar.build_progress_bar(args, itr) as t:
        if args.score_reference:
            translations = translator.score_batched_itr(t, cuda=use_cuda, timer=gen_timer)
        else:
            translations = translator.generate_batched_itr(
                t,
                maxlen_a=args.max_len_a,
                maxlen_b=args.max_len_b,
                cuda=use_cuda,
                timer=gen_timer,
                prefix_size=args.prefix_size,
            )

        wps_meter = TimeMeter()
        for sample_id, src_tokens, target_tokens, hypos in translations:
            # Process input and ground truth
            has_target = target_tokens is not None
            target_tokens = target_tokens.int().cpu() if has_target else None

            # Either retrieve the original sentences or regenerate them from tokens.
            if align_dict is not None:
                src_str = task.dataset(args.gen_subset).src.get_original_text(sample_id)
                target_str = task.dataset(args.gen_subset).tgt.get_original_text(sample_id)
            else:
                src_str = src_dict.string(src_tokens, args.remove_bpe)
                if has_target:
                    target_str = tgt_dict.string(target_tokens, args.remove_bpe, escape_unk=True)

            sources.append(src_str)
            targets.append(target_str)

            if not args.quiet:
                print('S-{}\t{}'.format(sample_id, src_str))
                if has_target:
                    print('T-{}\t{}'.format(sample_id, target_str))

            iteration = 0
            curr_src_str = src_str
            best_fluency_score = fluency_scorer.score_sentence(src_str).item()
            best_hypo_str = ''

            # Boost inference
            while True:
                hypo_tokens_list = []
                hypo_str_list = []
                hypo_fluency_score_list = []

                # Process top predictions
                for i, hypo in enumerate(hypos[:min(len(hypos), args.nbest)]):
                    hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                        hypo_tokens=hypo['tokens'].int().cpu(),
                        src_str=curr_src_str,
                        alignment=hypo['alignment'].int().cpu()
                        if hypo['alignment'] is not None else None,
                        align_dict=align_dict,
                        tgt_dict=tgt_dict,
                        remove_bpe=args.remove_bpe,
                    )
                    hypo_tokens_list.append(hypo_tokens)
                    hypo_str_list.append(hypo_str)
                    hypo_fluency_score = fluency_scorer.score_sentence(hypo_str).item()
                    hypo_fluency_score_list.append(hypo_fluency_score)

                    if not args.quiet:
                        # print('H-{}\t{}\t{}'.format(sample_id, hypo['score'], hypo_str))
                        print('H-{}\t{}\t{}'.format(sample_id, hypo_str, hypo['score']))
                        print('P-{}\t{}'.format(
                            sample_id,
                            ' '.join(
                                map(
                                    lambda x: '{:.4f}'.format(x),
                                    hypo['positional_scores'].tolist(),
                                ))))
                        print('F-{}\t{}'.format(sample_id, hypo_fluency_score))
                        if args.print_alignment:
                            print('A-{}\t{}'.format(
                                sample_id,
                                ' '.join(map(lambda x: str(utils.item(x)), alignment))))

                # Compare best scores
                max_fluency_score = max(hypo_fluency_score_list)
                max_idx = hypo_fluency_score_list.index(max_fluency_score)
                max_hypo_str = hypo_str_list[max_idx]
                if max_fluency_score <= best_fluency_score:
                    # Score only the top hypothesis
                    if align_dict is not None or args.remove_bpe is not None:
                        # Convert back to tokens for evaluation with unk replacement and/or without BPE
                        target_tokens = tokenizer.Tokenizer.tokenize(
                            target_str, tgt_dict, add_if_not_exist=True)
                    max_tokens = hypo_tokens_list[max_idx]
                    scorer.add(target_tokens, max_tokens)
                    hypoths.append(max_hypo_str)
                    break
                else:
                    # Keep boosting
                    iteration = iteration + 1
                    curr_src_str = max_hypo_str
                    best_fluency_score = max_fluency_score
                    best_hypo_str = max_hypo_str

            wps_meter.update(src_tokens.size(0))
            t.log({'wps': round(wps_meter.avg)})
            num_sentences += 1

    print('| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'
          .format(num_sentences, gen_timer.n, gen_timer.sum,
                  num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if has_target:
        print('| Generate {} with beam={}: {}'.format(args.gen_subset, args.beam,
                                                      scorer.result_string()))

    # compute GLEU
    gleu_calculator = GLEU(args.n)
    gleu_calculator.load_text_sources(sources)
    gleu_calculator.load_text_references([targets])
    gleu_scores = gleu_calculator.run_iterations(num_iterations=args.iter,
                                                 hypothesis=hypoths,
                                                 per_sent=args.sent)
    gleu_score = [g for g in gleu_scores][0][0] * 100
    print('| Generate {} with beam={}: GLEU = {:2.2f}'.format(
        args.gen_subset, args.beam, gleu_score))

                    help='path to src sentences')
parser.add_argument('-r', '--ref',
                    nargs='*',
                    required=True,
                    help='references to use')
parser.add_argument('-d', '--debug',
                    default=False,
                    action='store_true',
                    help='print debugging messages')
parser.add_argument('-c', '--cand',
                    nargs='*',
                    required=True,
                    help='candidate(s) to score')

args = parser.parse_args()

gleu_calculator = GLEU(4)
gleu_calculator.load_sources(args.src)
num_iterations = 200
gleu_calculator.load_references(args.ref)

for cand in args.cand:
    # print the candidate name followed by its corpus-level GLEU score
    print(cand, [
        float(g[0]) for g in gleu_calculator.run_iterations(
            num_iterations=num_iterations,
            source=args.src,
            hypothesis=cand,
            per_sent=False)
    ][0])

def make_result(src_str, hypos, tgt_str='', iteration=0):
    results = []

    # compute fluency score for source string
    # the source string itself is an entry
    result0 = Correction()
    result0.iteration = iteration
    result0.src_str = result0.hypo_str = src_str
    fluency_scores = fluency_scorer.score_sentence(src_str).item()
    result0.fluency_scores = fluency_scores
    result0.fluency_scores_str = "Fluency Score: {:0.4f}".format(fluency_scores)
    results.append(result0)

    # Process top predictions
    for hypo in hypos[:min(len(hypos), args.nbest)]:
        result = Correction()
        result.iteration = iteration + 1
        result.src_str = src_str

        hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
            hypo_tokens=hypo['tokens'].int().cpu(),
            src_str=src_str,
            alignment=hypo['alignment'].int().cpu()
            if hypo['alignment'] is not None else None,
            align_dict=align_dict,
            tgt_dict=tgt_dict,
            remove_bpe=args.remove_bpe,
        )

        # result.hypos.append('H\t{}\t{}'.format(hypo['score'], hypo_str))
        result.hypo_str = hypo_str
        result.hypo_score = result.hypo_score_str = hypo['score']
        result.pos_scores_str = 'P\t{}'.format(' '.join(
            map(
                lambda x: '{:.4f}'.format(x),
                hypo['positional_scores'].tolist(),
            )))
        result.alignments_str = ('A\t{}'.format(' '.join(
            map(lambda x: str(utils.item(x)), alignment)))
            if args.print_alignment else None)

        # compute GLEU if target is provided
        if tgt_str:
            gleu_calculator = GLEU(args.n)
            gleu_calculator.load_text_sources([src_str])
            gleu_calculator.load_text_references([[tgt_str]])
            gleu_scores = gleu_calculator.run_iterations(
                num_iterations=args.iter,
                hypothesis=[hypo_str],
                per_sent=args.sent)
            gleu_score = [g for g in gleu_scores][0][0] * 100
            result.gleu_scores = gleu_score
            result.gleu_scores_str = 'GLEU {:2.2f}'.format(gleu_score)
        else:
            result.gleu_scores_str = ('GLEU N/A (no target was provided; use the format '
                                      '"source sentence|target sentence" to provide a target/reference)')

        # compute fluency score
        fluency_scores = fluency_scorer.score_sentence(hypo_str).item()
        result.fluency_scores = fluency_scores
        result.fluency_scores_str = "Fluency Score: {:0.4f}".format(fluency_scores)

        results.append(result)

    return results

                    default=False,
                    action="store_true")
parser.add_argument('--iter',
                    type=int,
                    default=500,
                    help='the number of iterations to run')

args = parser.parse_args()

num_iterations = args.iter

# if there is only one reference, just do one iteration
if len(args.reference) == 1:
    num_iterations = 1

gleu_calculator = GLEU(args.n)
gleu_calculator.load_sources(args.source)
gleu_calculator.load_references(args.reference)

for hpath in args.hypothesis:
    instream = sys.stdin if hpath == '-' else open(hpath)
    hyp = [line.split() for line in instream]

    if not args.debug:
        # print the system name without a newline so its score follows on the same line
        print(os.path.basename(hpath), end=' ')

    # first generate a random list of indices, using a different seed
    # for each iteration
    indices = []
    for j in range(num_iterations):