def main():
    parser = get_parser()
    args = parser.parse_args()
    print(args)

    assert args.sys == '-' or os.path.exists(args.sys), \
        "System output file {} does not exist".format(args.sys)
    assert os.path.exists(args.ref), \
        "Reference file {} does not exist".format(args.ref)

    dict = dictionary.Dictionary()

    def readlines(fd):
        for line in fd.readlines():
            if args.ignore_case:
                yield line.lower()
            else:
                yield line

    def score(fdsys):
        with open(args.ref) as fdref:
            scorer = bleu.Scorer(dict.pad(), dict.eos(), dict.unk())
            for sys_tok, ref_tok in zip(readlines(fdsys), readlines(fdref)):
                sys_tok = tokenizer.Tokenizer.tokenize(sys_tok, dict)
                ref_tok = tokenizer.Tokenizer.tokenize(ref_tok, dict)
                scorer.add(ref_tok, sys_tok)
            print(scorer.result_string(args.order))

    if args.sys == '-':
        score(sys.stdin)
    else:
        with open(args.sys, 'r') as f:
            score(f)
def main():
    parser = argparse.ArgumentParser(
        description='Command-line script for BLEU scoring.')
    parser.add_argument('-s', '--sys', default='-', help='system output')
    parser.add_argument('-r', '--ref', required=True, help='references')
    parser.add_argument('-o', '--order', default=4, metavar='N', type=int,
                        help='consider ngrams up to this order')
    parser.add_argument('--ignore-case', action='store_true',
                        help='case-insensitive scoring')
    parser.add_argument(
        '--tokenizer_name', metavar='N', default='default',
        choices=['default', 'nltk', 'sacremoses'],
        help="Which tokenizer to use. Choices are default, nltk, sacremoses. "
             "default tokenizes by splitting on whitespace. nltk uses nltk's "
             "word_tokenize, which better takes punctuation into account. "
             "For example, \"Hello, how's your day today?\" is tokenized as "
             "['Hello,', \"how's\", 'your', 'day', 'today?'] by the default "
             "tokenizer, but as "
             "['Hello', ',', 'how', \"'s\", 'your', 'day', 'today', '?'] "
             "by nltk. The sacremoses tokenizer comes from this package: "
             "https://github.com/alvations/sacremoses.")
    args = parser.parse_args()
    print(args)

    assert args.sys == '-' or os.path.exists(args.sys), \
        "System output file {} does not exist".format(args.sys)
    assert os.path.exists(args.ref), \
        "Reference file {} does not exist".format(args.ref)

    dict = dictionary.Dictionary()
    tokenizer_tool = tokenizer.build_tokenizer(args)

    def readlines(fd):
        for line in fd.readlines():
            if args.ignore_case:
                yield line.lower()
            else:
                yield line

    def score(fdsys):
        with open(args.ref) as fdref:
            scorer = bleu.Scorer(dict.pad(), dict.eos(), dict.unk())
            for sys_tok, ref_tok in zip(readlines(fdsys), readlines(fdref)):
                sys_tok = tokenizer_tool.tokenize(sys_tok, dict)
                ref_tok = tokenizer_tool.tokenize(ref_tok, dict)
                scorer.add(ref_tok, sys_tok)
            print(scorer.result_string(args.order))

    if args.sys == '-':
        score(sys.stdin)
    else:
        with open(args.sys, 'r') as f:
            score(f)
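# A minimal sketch (assuming the nltk package is installed and its 'punkt'
# tokenizer model has been downloaded) illustrating the difference described
# in the --tokenizer_name help text above: whitespace splitting keeps
# punctuation attached to words, while nltk's word_tokenize separates it.
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')  # one-time download of the tokenizer model

sentence = "Hello, how's your day today?"
print(sentence.split())         # ['Hello,', "how's", 'your', 'day', 'today?']
print(word_tokenize(sentence))  # ['Hello', ',', 'how', "'s", 'your', 'day', 'today', '?']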
def build_dictionary(filenames):
    if args.singleSeq:
        d = dictionary.Dictionary()
    else:
        d = dictionaryWCS.DictionaryWCS()
    for filename in filenames:
        tokenizer.Tokenizer.add_file_to_dictionary(filename, d, tokenize_line,
                                                   args.L)
    return d
def main():
    parser = get_parser()
    args = parser.parse_args()
    print(args)

    assert args.sys == "-" or os.path.exists(args.sys), \
        "System output file {} does not exist".format(args.sys)
    assert os.path.exists(args.ref), \
        "Reference file {} does not exist".format(args.ref)

    dict = dictionary.Dictionary()

    def readlines(fd):
        for line in fd.readlines():
            if args.ignore_case:
                yield line.lower()
            else:
                yield line

    if args.sacrebleu:
        import sacrebleu

        def score(fdsys):
            with open(args.ref) as fdref:
                print(sacrebleu.corpus_bleu(fdsys, [fdref]))

    elif args.sentence_bleu:

        def score(fdsys):
            with open(args.ref) as fdref:
                scorer = bleu.Scorer(dict.pad(), dict.eos(), dict.unk())
                for i, (sys_tok, ref_tok) in enumerate(
                        zip(readlines(fdsys), readlines(fdref))):
                    scorer.reset(one_init=True)
                    sys_tok = dict.encode_line(sys_tok)
                    ref_tok = dict.encode_line(ref_tok)
                    scorer.add(ref_tok, sys_tok)
                    print(i, scorer.result_string(args.order))

    else:

        def score(fdsys):
            with open(args.ref) as fdref:
                scorer = bleu.Scorer(dict.pad(), dict.eos(), dict.unk())
                for sys_tok, ref_tok in zip(readlines(fdsys), readlines(fdref)):
                    sys_tok = dict.encode_line(sys_tok)
                    ref_tok = dict.encode_line(ref_tok)
                    scorer.add(ref_tok, sys_tok)
                print(scorer.result_string(args.order))

    if args.sys == "-":
        score(sys.stdin)
    else:
        with open(args.sys, "r") as f:
            score(f)
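# A minimal sketch (assuming the sacrebleu package is installed) of the two
# scoring modes selected above: corpus-level BLEU over all lines at once,
# and per-sentence BLEU printed line by line.
import sacrebleu

hyps = ["the cat sat on the mat", "he read the book"]
refs = ["the cat sat on the mat", "he read a book"]

print(sacrebleu.corpus_bleu(hyps, [refs]))         # one score for the corpus
for i, (hyp, ref) in enumerate(zip(hyps, refs)):
    print(i, sacrebleu.sentence_bleu(hyp, [ref]))  # one score per sentence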
def score(fdsys, tofile, refFile):
    # readlines() and args_score are expected to be defined at module level
    dict = dictionary.Dictionary()
    with open(refFile) as fdref:
        scorer = bleu.Scorer(dict.pad(), dict.eos(), dict.unk())
        for sys_tok, ref_tok in zip(readlines(fdsys), readlines(fdref)):
            sys_tok = dict.encode_line(sys_tok)
            ref_tok = dict.encode_line(ref_tok)
            scorer.add(ref_tok, sys_tok)
        print(scorer.result_string(args_score.order))
        with open(tofile, 'a') as f:
            f.write(scorer.result_string(args_score.order) + '\r\n')
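# A hedged usage sketch of the score() helper above (the file paths here are
# hypothetical): it prints the BLEU result and appends it to a running log,
# which is why tofile is opened in append mode.
with open('hyp.txt') as fdsys:  # hypothetical system output
    score(fdsys, tofile='bleu_results.log', refFile='ref.txt')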
def build_nstack_source_dictionary(_src_file):
    # args, src, remove_root, take_pos_tag, take_nodes and no_collapse are
    # expected from the enclosing scope
    d = dictionary.Dictionary()
    print(f'Build dict on src_file: {_src_file}')
    NstackTreeTokenizer.acquire_vocab_multithread(
        _src_file, d, tokenize_line,
        num_workers=args.workers,
        remove_root=remove_root,
        take_pos_tag=take_pos_tag,
        take_nodes=take_nodes,
        no_collapse=no_collapse,
    )
    d.finalize(
        threshold=args.thresholdsrc if src else args.thresholdtgt,
        nwords=args.nwordssrc if src else args.nwordstgt,
        padding_factor=args.padding_factor,
    )
    print(f'Finish building src vocabulary: size {len(d)}')
    return d
def main():
    parser = argparse.ArgumentParser(
        description='Command-line script for BLEU scoring.')
    parser.add_argument('-s', '--sys', default='-', help='system output')
    parser.add_argument('-r', '--ref', required=True, help='references')
    parser.add_argument('-o', '--order', default=4, metavar='N', type=int,
                        help='consider ngrams up to this order')
    parser.add_argument('--ignore-case', action='store_true',
                        help='case-insensitive scoring')
    args = parser.parse_args()
    print(args)

    assert args.sys == '-' or os.path.exists(args.sys), \
        "System output file {} does not exist".format(args.sys)
    assert os.path.exists(args.ref), \
        "Reference file {} does not exist".format(args.ref)

    dict = dictionary.Dictionary()

    def readlines(fd):
        for line in fd.readlines():
            if args.ignore_case:
                yield line.lower()
            else:
                yield line

    def score(fdsys):
        with open(args.ref) as fdref:
            scorer = bleu.Scorer(dict.pad(), dict.eos(), dict.unk())
            for sys_tok, ref_tok in zip(readlines(fdsys), readlines(fdref)):
                sys_tok = tokenizer.Tokenizer.tokenize(sys_tok, dict)
                ref_tok = tokenizer.Tokenizer.tokenize(ref_tok, dict)
                scorer.add(ref_tok, sys_tok)
            print(scorer.result_string(args.order))

    if args.sys == '-':
        score(sys.stdin)
    else:
        with open(args.sys, 'r') as f:
            score(f)
parser.add_argument('--sys', nargs='*', default='', metavar='FILE',
                    help='path to system output')
parser.add_argument('--ref', default='', metavar='FILE',
                    help='path to references')
parser.add_argument('--output', default='', metavar='FILE',
                    help='print outputs into a pretty format')
args = parser.parse_args()

dict = dictionary.Dictionary()
scorer = bleu.Scorer(dict.pad(), dict.eos(), dict.unk())

def dictolist(d):
    a = sorted(d.items(), key=lambda i: i[0])
    return [i[1] for i in a]

def load_sys(paths):
    src, tgt, hypos, log_probs = {}, {}, {}, {}
    for path in paths:
        with open(path) as f:
            for line in f:
                if line.startswith(('S-', 'T-', 'H-')):
                    i = int(line[line.find('-') + 1:line.find('\t')])
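# A minimal, self-contained sketch of the line format load_sys() parses:
# fairseq-generate prefixes each output line with S- (source), T- (target),
# or H- (hypothesis), followed by the sentence index and tab-separated fields.
sample = "H-3\t-0.4517\tthe cat sat on the mat"
prefix = sample[:sample.find('-')]                           # 'H'
index = int(sample[sample.find('-') + 1:sample.find('\t')])  # 3
fields = sample.rstrip().split('\t')                         # ['H-3', '-0.4517', 'the cat ...']
print(prefix, index, fields[-1])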
def score(args, trainer, dataset, src_dict, tgt_dict, ref_file):
    begin = time.time()

    # Deep copies are necessary: generating translations alters the target
    # dictionary, which would mess up the rest of training.
    src_dict = deepcopy(src_dict)
    tgt_dict = deepcopy(tgt_dict)

    model = trainer.get_model()

    # Initialize data iterator
    itr = data.EpochBatchIterator(
        dataset=dataset,
        max_tokens=None,
        max_sentences=max(
            8, min(math.ceil(1024 / args.distributed_world_size), 128)),
        max_positions=args.max_positions,
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=8,
        num_shards=args.distributed_world_size,
        shard_id=args.distributed_rank,
    ).next_epoch_itr(shuffle=False)

    # Initialize generator
    gen_timer = StopwatchMeter()
    translator = SequenceGenerator(
        [model],
        tgt_dict.get_metadata(),
        maxlen=args.max_target_positions - 1,  # do not include EOS token
        beam_size=args.beam,
        stop_early=(not args.no_early_stop),
        normalize_scores=(not args.unnormalized),
        len_penalty=args.lenpen,
        unk_penalty=args.unkpen,
        sampling=args.sampling,
        sampling_topk=args.sampling_topk,
        minlen=args.min_len,
    )

    # Generate and compute BLEU
    dict = dictionary.Dictionary()
    num_sentences = 0
    predictions = []
    translations = translator.generate_batched_itr(
        itr,
        maxlen_a=args.max_len_a,
        maxlen_b=args.max_len_b,
        cuda=True,
        timer=gen_timer,
        prefix_size=args.prefix_size,
    )

    for sample_id, src_tokens, target_tokens, hypos in translations:
        # Process input and ground truth
        target_tokens = target_tokens.int().cpu()
        src_str = src_dict.string(src_tokens, args.remove_bpe)
        target_str = tgt_dict.string(target_tokens, args.remove_bpe,
                                     escape_unk=True)

        # Process top predictions
        for i, hypo in enumerate(hypos[:min(len(hypos), args.nbest)]):
            hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                hypo_tokens=hypo['tokens'].int().cpu(),
                src_str=src_str,
                alignment=hypo['alignment'].int().cpu()
                if hypo['alignment'] is not None else None,
                align_dict=None,
                tgt_dict=tgt_dict,
                remove_bpe=args.remove_bpe)

            # Score only the top hypothesis
            if i == 0:
                if args.sentencepiece:
                    hypo_str = hypo_str.replace(' ', '').replace('▁', ' ')
                    target_str = target_str.replace(' ', '').replace('▁', ' ')
                sys_tok = tokenizer.Tokenizer.tokenize(
                    (hypo_str.lower() if not args.test_cased_bleu else hypo_str),
                    dict)
                ref_tok = tokenizer.Tokenizer.tokenize(
                    (target_str.lower() if not args.test_cased_bleu else target_str),
                    dict)
                if not args.sentencepiece:
                    hypo_str = tokenizer.Tokenizer.detokenize(hypo_str, 'de')
                predictions.append('{}\t{}'.format(sample_id, hypo_str))

        num_sentences += 1

    if args.distributed_world_size > 1:
        predictions = _all_gather_predictions(predictions)

    with open(os.path.join(args.data, ref_file), 'r') as reference:
        refs = [reference.readlines()]

    # Reducing indexed predictions as strings is more memory efficient than
    # reducing tuples.
    predictions = [tuple(item.split('\t')) for item in predictions]
    predictions = [(int(item[0]), item[1]) for item in predictions]
    predictions.sort(key=lambda tup: tup[0])
    predictions = [
        hypo[1] + ('\n' if hypo[1][-1] != '\n' else '') for hypo in predictions
    ]

    sacrebleu_score = sacrebleu.corpus_bleu(
        predictions, refs, lowercase=not args.test_cased_bleu).score

    if args.save_predictions:
        os.makedirs(os.path.join(args.save_dir, 'predictions'), exist_ok=True)
        with open(
                os.path.join(
                    args.save_dir, 'predictions',
                    ref_file + '.pred.update_{}'.format(trainer._num_updates)),
                'w') as f:
            f.write(''.join(predictions))

    DLLogger.log(step=trainer.get_num_updates(),
                 data={
                     'inference tokens/s':
                     float(args.distributed_world_size) / gen_timer.avg
                 },
                 verbosity=0)
    DLLogger.flush()

    if gen_timer.sum != 0:
        print(
            '| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'
            .format(len(predictions), gen_timer.n, gen_timer.sum,
                    len(predictions) / gen_timer.sum,
                    float(args.distributed_world_size) / gen_timer.avg))

    print('| Eval completed in: {:.2f}s | {}CASED BLEU {:.2f}'.format(
        time.time() - begin, '' if args.test_cased_bleu else 'UN',
        sacrebleu_score))

    return sacrebleu_score
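# A minimal sketch of the sentencepiece detokenization trick used above:
# the spaces between pieces are deleted, then the '▁' word-boundary marker
# is turned back into a regular space (plus a strip, which the code above
# omits, to drop the leading boundary marker's space).
pieces = '▁the ▁cat ▁sat ▁on ▁the ▁mat'
detok = pieces.replace(' ', '').replace('▁', ' ').strip()
print(detok)  # 'the cat sat on the mat'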
def build_dictionary(filenames):
    d = dictionary.Dictionary()
    for filename in filenames:
        Tokenizer.add_file_to_dictionary(filename, d, tokenize_line)
    return d
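# A minimal, self-contained sketch (plain Python, not the fairseq API) of
# what add_file_to_dictionary does conceptually: count every token in a file
# so the dictionary can later be finalized into an index.
from collections import Counter

def add_file_to_counts(filename, counts, tokenize=str.split):
    with open(filename) as f:
        for line in f:
            counts.update(tokenize(line))

counts = Counter()
# add_file_to_counts('train.en', counts)  # hypothetical corpus file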
def score_target_hypo(args, a, b, c, lenpen, target_outfile, hypo_outfile,
                      write_hypos, normalize):
    print("lenpen", lenpen, "weight1", a, "weight2", b, "weight3", c)

    gen_output_lst, bitext1_lst, bitext2_lst, lm_res_lst = load_score_files(args)
    dict = dictionary.Dictionary()
    scorer = bleu.Scorer(
        bleu.BleuConfig(
            pad=dict.pad(),
            eos=dict.eos(),
            unk=dict.unk(),
        ))

    ordered_hypos = {}
    ordered_targets = {}

    for shard_id in range(len(bitext1_lst)):
        bitext1 = bitext1_lst[shard_id]
        bitext2 = bitext2_lst[shard_id]
        gen_output = gen_output_lst[shard_id]
        lm_res = lm_res_lst[shard_id]

        total = len(bitext1.rescore_source.keys())
        source_lst = []
        hypo_lst = []
        score_lst = []
        reference_lst = []
        j = 1
        best_score = -math.inf

        for i in range(total):
            # length is measured in terms of words, not bpe tokens, since
            # models may not share the same bpe
            target_len = len(bitext1.rescore_hypo[i].split())

            if lm_res is not None:
                lm_score = lm_res.score[i]
            else:
                lm_score = 0

            if bitext2 is not None:
                bitext2_score = bitext2.rescore_score[i]
                bitext2_backwards = bitext2.backwards
            else:
                bitext2_score = None
                bitext2_backwards = None

            score = rerank_utils.get_score(
                a, b, c, target_len,
                bitext1.rescore_score[i],
                bitext2_score,
                lm_score=lm_score,
                lenpen=lenpen,
                src_len=bitext1.source_lengths[i],
                tgt_len=bitext1.target_lengths[i],
                bitext1_backwards=bitext1.backwards,
                bitext2_backwards=bitext2_backwards,
                normalize=normalize,
            )

            if score > best_score:
                best_score = score
                best_hypo = bitext1.rescore_hypo[i]

            if j == gen_output.num_hypos[i] or j == args.num_rescore:
                j = 1
                hypo_lst.append(best_hypo)
                score_lst.append(best_score)
                source_lst.append(bitext1.rescore_source[i])
                reference_lst.append(bitext1.rescore_target[i])

                best_score = -math.inf
                best_hypo = ""
            else:
                j += 1

        gen_keys = list(sorted(gen_output.no_bpe_target.keys()))

        for key in range(len(gen_keys)):
            if args.prefix_len is None:
                assert hypo_lst[key] in gen_output.no_bpe_hypo[gen_keys[key]], (
                    "pred and rescore hypo mismatch: i: " + str(key) + ", " +
                    str(hypo_lst[key]) + str(gen_keys[key]) +
                    str(gen_output.no_bpe_hypo[key]))
                sys_tok = dict.encode_line(hypo_lst[key])
                ref_tok = dict.encode_line(gen_output.no_bpe_target[gen_keys[key]])
                scorer.add(ref_tok, sys_tok)
            else:
                full_hypo = rerank_utils.get_full_from_prefix(
                    hypo_lst[key], gen_output.no_bpe_hypo[gen_keys[key]])
                sys_tok = dict.encode_line(full_hypo)
                ref_tok = dict.encode_line(gen_output.no_bpe_target[gen_keys[key]])
                scorer.add(ref_tok, sys_tok)

        # if only one set of hyperparameters is provided, write the
        # predictions to a file
        if write_hypos:
            # recover the original ids from n-best list generation
            for key in range(len(gen_output.no_bpe_target)):
                if args.prefix_len is None:
                    assert hypo_lst[key] in gen_output.no_bpe_hypo[gen_keys[key]], (
                        "pred and rescore hypo mismatch:" + "i:" + str(key) +
                        str(hypo_lst[key]) + str(gen_output.no_bpe_hypo[key]))
                    ordered_hypos[gen_keys[key]] = hypo_lst[key]
                    ordered_targets[gen_keys[key]] = gen_output.no_bpe_target[
                        gen_keys[key]]
                else:
                    full_hypo = rerank_utils.get_full_from_prefix(
                        hypo_lst[key], gen_output.no_bpe_hypo[gen_keys[key]])
                    ordered_hypos[gen_keys[key]] = full_hypo
                    ordered_targets[gen_keys[key]] = gen_output.no_bpe_target[
                        gen_keys[key]]

    # write the hypos in the original order from n-best list generation
    if args.num_shards == (len(bitext1_lst)):
        with open(target_outfile, "w") as t:
            with open(hypo_outfile, "w") as h:
                for key in range(len(ordered_hypos)):
                    t.write(ordered_targets[key])
                    h.write(ordered_hypos[key])

    res = scorer.result_string(4)
    if write_hypos:
        print(res)
    score = rerank_utils.parse_bleu_scoring(res)
    return score
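# A hypothetical sketch of the kind of linear combination a helper like
# rerank_utils.get_score might compute (the real fairseq implementation
# differs in details such as backwards-model handling and normalization):
# weighted forward/backward model scores plus a language-model score,
# normalized by a length penalty.
def combined_score(w1, w2, w3, fw_score, bw_score, lm_score, target_len, lenpen):
    total = w1 * fw_score \
        + (w2 * bw_score if bw_score is not None else 0) \
        + w3 * lm_score
    return total / (target_len ** lenpen)  # longer hypotheses are penalized

print(combined_score(1.0, 0.5, 0.3, -4.2, -5.1, -6.0, target_len=7, lenpen=1.0))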
def main():
    parser = argparse.ArgumentParser(description=(
        'Extract back-translations from the stdout of fairseq-generate. '
        'If there are multiple hypotheses for a source, we only keep the '
        'first one.'))
    parser.add_argument('--output', required=True, help='output prefix')
    parser.add_argument('--srclang', required=True,
                        help='source language (extracted from H-* lines)')
    parser.add_argument('--tgtlang', required=True,
                        help='target language (extracted from S-* lines)')
    parser.add_argument('--minlen', type=int, help='min length filter')
    parser.add_argument('--maxlen', type=int, help='max length filter')
    parser.add_argument('--ratio', type=float, help='ratio filter')
    parser.add_argument('files', nargs='*', help='input files')
    args = parser.parse_args()

    dict = dictionary.Dictionary()
    scorer = bleu.Scorer(dict.pad(), dict.eos(), dict.unk())

    def validate(src, tgt):
        srclen = len(src.split(' ')) if src != '' else 0
        tgtlen = len(tgt.split(' ')) if tgt != '' else 0
        if ((args.minlen is not None
             and (srclen < args.minlen or tgtlen < args.minlen))
                or (args.maxlen is not None
                    and (srclen > args.maxlen or tgtlen > args.maxlen))
                or (args.ratio is not None and
                    (max(srclen, tgtlen) / float(min(srclen, tgtlen)) > args.ratio))):
            return False
        return True

    def safe_index(toks, index, default):
        try:
            return toks[index]
        except IndexError:
            return default

    # initialize so an H-* line arriving first cannot raise a NameError
    tgt = None
    gt = None
    with open(args.output + '.' + args.srclang, 'w') as src_h, \
            open(args.output + '.' + args.tgtlang, 'w') as tgt_h:
        for line in tqdm(fileinput.input(args.files)):
            if line.startswith('S-'):
                tgt = safe_index(line.rstrip().split('\t'), 1, '')
            elif line.startswith('T-'):
                gt = safe_index(line.rstrip().split('\t'), 1, '')
            elif line.startswith('H-'):
                if tgt is not None:
                    if gt is not None:
                        src = safe_index(line.rstrip().split('\t'), 2, '')
                        if validate(src, tgt):
                            print(src, file=src_h)
                            print(tgt, file=tgt_h)
                        else:
                            if validate(gt, tgt):
                                print(gt, file=src_h)
                                print(tgt, file=tgt_h)
                    tgt = None
                    gt = None
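# A small, self-contained illustration of the --ratio filter above: a pair is
# rejected when the longer side is more than `ratio` times the shorter side.
src = 'ein kleines Haus'                # 3 tokens
tgt = 'a very small house by the sea'   # 7 tokens
ratio = 1.5
print(max(3, 7) / float(min(3, 7)) > ratio)  # True -> the pair is filtered out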
def build_dictionary(filenames, src_lang=None, trg_lang=None):
    d = dictionary.Dictionary(src_lang=src_lang, trg_lang=trg_lang)
    for filename in filenames:
        Tokenizer.add_file_to_dictionary(filename, d, tokenize_line)
    return d
def score(args, trainer, task, epoch_itr, subset):
    begin = time.time()

    if subset not in task.datasets.keys():
        task.load_dataset(subset)

    # Deep copies are necessary: generating translations alters the target
    # dictionary, which would mess up the rest of training.
    src_dict = deepcopy(task.source_dictionary)
    tgt_dict = deepcopy(task.target_dictionary)

    model = trainer.get_model()

    # Initialize data iterator
    itr = data.EpochBatchIterator(
        dataset=task.dataset(subset),
        max_tokens=None,
        max_sentences=max(
            8, min(math.ceil(1024 / args.distributed_world_size), 128)),
        max_positions=model.max_positions(),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=8,
        num_shards=args.distributed_world_size,
        shard_id=args.distributed_rank,
    ).next_epoch_itr(shuffle=False)

    # Initialize generator
    gen_timer = StopwatchMeter()
    translator = SequenceGenerator(
        [model],
        tgt_dict,
        beam_size=args.beam,
        stop_early=(not args.no_early_stop),
        normalize_scores=(not args.unnormalized),
        len_penalty=args.lenpen,
        unk_penalty=args.unkpen,
        sampling=args.sampling,
        sampling_topk=args.sampling_topk,
        minlen=args.min_len,
    )

    # Generate and compute BLEU
    dict = dictionary.Dictionary()
    scorer = bleu.Scorer(dict.pad(), dict.eos(), dict.unk())
    num_sentences = 0
    has_target = True
    predictions = []

    with progress_bar.build_progress_bar(args, itr) as progress:
        translations = translator.generate_batched_itr(
            progress,
            maxlen_a=args.max_len_a,
            maxlen_b=args.max_len_b,
            cuda=True,
            timer=gen_timer,
            prefix_size=args.prefix_size,
        )
        wps_meter = TimeMeter()

        for sample_id, src_tokens, target_tokens, hypos in translations:
            # Process input and ground truth
            has_target = target_tokens is not None
            target_tokens = target_tokens.int().cpu() if has_target else None
            src_str = src_dict.string(src_tokens, args.remove_bpe)
            if has_target:
                target_str = tgt_dict.string(target_tokens, args.remove_bpe,
                                             escape_unk=True)

            # Process top predictions
            for i, hypo in enumerate(hypos[:min(len(hypos), args.nbest)]):
                hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                    hypo_tokens=hypo['tokens'].int().cpu(),
                    src_str=src_str,
                    alignment=hypo['alignment'].int().cpu()
                    if hypo['alignment'] is not None else None,
                    align_dict=None,
                    tgt_dict=tgt_dict,
                    remove_bpe=args.remove_bpe)

                # Score only the top hypothesis
                if has_target and i == 0:
                    if args.sentencepiece:
                        hypo_str = hypo_str.replace(' ', '').replace('▁', ' ')
                        target_str = target_str.replace(' ', '').replace('▁', ' ')
                    sys_tok = tokenizer.Tokenizer.tokenize(
                        (hypo_str.lower() if args.ignore_case else hypo_str),
                        dict)
                    ref_tok = tokenizer.Tokenizer.tokenize(
                        (target_str.lower() if args.ignore_case else target_str),
                        dict)
                    scorer.add(ref_tok, sys_tok)
                    if not args.sentencepiece:
                        hypo_str = tokenizer.Tokenizer.detokenize(hypo_str, 'de')
                    predictions.append('{}\t{}'.format(sample_id, hypo_str))

            wps_meter.update(src_tokens.size(0))
            progress.log({'wps': round(wps_meter.avg)})
            num_sentences += 1

    if args.distributed_world_size > 1:
        _all_gather_bleu_scorer(scorer)
        predictions = _all_gather_predictions(predictions)

    with open(os.path.join(args.data, 'sacrebleu_reference.de'), 'r') as reference:
        refs = [reference.readlines()]

    # Reducing indexed predictions as strings is more memory efficient than
    # reducing tuples.
    predictions = [tuple(item.split('\t')) for item in predictions]
    predictions = [(int(item[0]), item[1]) for item in predictions]
    predictions.sort(key=lambda tup: tup[0])
    predictions = [
        hypo[1] + ('\n' if hypo[1][-1] != '\n' else '') for hypo in predictions
    ]
    sacrebleu_score = sacrebleu.corpus_bleu(predictions, refs,
                                            lowercase=args.ignore_case)
    print(f'|Detokenized {sacrebleu_score}')

    if gen_timer.sum != 0:
        print(
            '| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'
            .format(num_sentences, gen_timer.n, gen_timer.sum,
                    num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if has_target:
        print('| Generate {} with beam={}: {}'.format(subset, args.beam,
                                                      scorer.result_string()))
    print('| Eval completed in: {:.2f}s'.format(time.time() - begin))

    return scorer.score(order=4), sacrebleu_score.score
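# A hedged sketch of what a helper like _all_gather_predictions could do (the
# name is taken from the call above; its real implementation is not shown
# here). Assuming torch.distributed is initialized, all_gather_object collects
# each rank's list of prediction strings, and the lists are then flattened.
import torch.distributed as dist

def gather_predictions(predictions):
    gathered = [None] * dist.get_world_size()
    dist.all_gather_object(gathered, predictions)  # one list per rank
    return [p for rank_preds in gathered for p in rank_preds]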
def score(args, trainer, task, epoch_itr, subset):
    begin = time.time()

    if subset not in task.datasets.keys():
        task.load_dataset(subset)

    # Deep copies are necessary: generating translations alters the target
    # dictionary, which would mess up the rest of training.
    src_dict = deepcopy(task.source_dictionary)
    tgt_dict = deepcopy(task.target_dictionary)

    model = trainer.get_model()

    # mlperf_log.transformer_print(key=mlperf_log.EVAL_SIZE,
    #                              value=task.dataset(subset).__len__())

    # Initialize data iterator
    itr = data.EpochBatchIterator(
        dataset=task.dataset(subset),
        max_tokens=None,
        max_sentences=max(
            8, min(math.ceil(1024 / args.distributed_world_size), 128)),
        max_positions=model.max_positions(),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=8,
        num_shards=args.distributed_world_size,
        shard_id=args.distributed_rank,
    ).next_epoch_itr(shuffle=False)

    # Initialize generator
    gen_timer = StopwatchMeter()
    translator = SequenceGenerator(
        [model],
        tgt_dict,
        beam_size=args.beam,
        stop_early=(not args.no_early_stop),
        normalize_scores=(not args.unnormalized),
        len_penalty=args.lenpen,
        unk_penalty=args.unkpen,
        sampling=args.sampling,
        sampling_topk=args.sampling_topk,
        minlen=args.min_len,
    )

    # Generate and compute BLEU
    dict = dictionary.Dictionary()
    scorer = bleu.Scorer(dict.pad(), dict.eos(), dict.unk())
    num_sentences = 0
    has_target = True

    if args.log_translations:
        log = open(
            os.path.join(
                args.save_dir,
                'translations_epoch{}_{}'.format(epoch_itr.epoch,
                                                 args.distributed_rank)),
            'w+')

    with progress_bar.build_progress_bar(args, itr) as progress:
        translations = translator.generate_batched_itr(
            progress,
            maxlen_a=args.max_len_a,
            maxlen_b=args.max_len_b,
            cuda=True,
            timer=gen_timer,
            prefix_size=args.prefix_size,
        )
        wps_meter = TimeMeter()

        for sample_id, src_tokens, target_tokens, hypos in translations:
            # Process input and ground truth
            has_target = target_tokens is not None
            target_tokens = target_tokens.int().cpu() if has_target else None
            src_str = src_dict.string(src_tokens, args.remove_bpe)
            if has_target:
                target_str = tgt_dict.string(target_tokens, args.remove_bpe,
                                             escape_unk=True)
            if args.log_translations:
                log.write('S-{}\t{}\n'.format(sample_id, src_str))
                if has_target:
                    log.write('T-{}\t{}\n'.format(sample_id, target_str))

            # Process top predictions
            for i, hypo in enumerate(hypos[:min(len(hypos), args.nbest)]):
                hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                    hypo_tokens=hypo['tokens'].int().cpu(),
                    src_str=src_str,
                    alignment=hypo['alignment'].int().cpu()
                    if hypo['alignment'] is not None else None,
                    align_dict=None,
                    tgt_dict=tgt_dict,
                    remove_bpe=args.remove_bpe)

                if args.log_translations:
                    log.write('H-{}\t{}\t{}\n'.format(sample_id, hypo['score'],
                                                      hypo_str))
                    log.write('P-{}\t{}\n'.format(
                        sample_id,
                        ' '.join(
                            map(
                                lambda x: '{:.4f}'.format(x),
                                hypo['positional_scores'].tolist(),
                            ))))

                # Score only the top hypothesis
                if has_target and i == 0:
                    sys_tok = tokenizer.Tokenizer.tokenize(
                        (hypo_str.lower() if args.ignore_case else hypo_str),
                        dict)
                    ref_tok = tokenizer.Tokenizer.tokenize(
                        (target_str.lower() if args.ignore_case else target_str),
                        dict)
                    scorer.add(ref_tok, sys_tok)

            wps_meter.update(src_tokens.size(0))
            progress.log({'wps': round(wps_meter.avg)})
            num_sentences += 1

    if args.distributed_world_size > 1:
        _all_gather_bleu_scorer(scorer)

    if args.log_translations:
        log.close()

    if gen_timer.sum != 0:
        print(
            '| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'
            .format(num_sentences, gen_timer.n, gen_timer.sum,
                    num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if has_target:
        print('| Generate {} with beam={}: {}'.format(subset, args.beam,
                                                      scorer.result_string()))
    print('| Eval completed in: {:.2f}s'.format(time.time() - begin))

    return scorer.score(order=4)
def score_target_hypo(args, a, b, c, lenpen, target_outfile, hypo_outfile,
                      score_outfile, lm_outfile, fw_outfile, bw_outfile,
                      write_hypos, normalize):
    print("lenpen", lenpen, "weight1", a, "weight2", b, "weight3", c,
          "target_outfile", target_outfile, "hypo_outfile", hypo_outfile,
          "lm_outfile", lm_outfile)

    gen_output_lst, bitext1_lst, bitext2_lst, lm_res_lst = load_score_files(args)
    dict = dictionary.Dictionary()
    if args.sacrebleu:
        scorer = bleu.SacrebleuScorer()
    else:
        scorer = bleu.Scorer(dict.pad(), dict.eos(), dict.unk())

    ordered_hypos = {}
    ordered_targets = {}
    ordered_scores = {}
    ordered_lm = {}
    ordered_fw = {}
    ordered_bw = {}

    for shard_id in range(len(bitext1_lst)):
        bitext1 = bitext1_lst[shard_id]
        bitext2 = bitext2_lst[shard_id]
        gen_output = gen_output_lst[shard_id]
        lm_res = lm_res_lst[shard_id]

        total = len(bitext1.rescore_source.keys())
        source_lst = []
        hypo_lst = []
        score_lst = []
        lm_lst = []
        fw_lst = []
        bw_lst = []
        reference_lst = []
        j = 1
        best_score = -math.inf

        for i in range(total):
            # length is measured in terms of words, not bpe tokens, since
            # models may not share the same bpe
            target_len = len(bitext1.rescore_hypo[i].split())

            if lm_res is not None and i in lm_res.score:
                lm_score = lm_res.score[i]
            else:
                lm_score = 0

            if bitext2 is not None:
                bitext2_score = bitext2.rescore_score[i]
                bitext2_backwards = bitext2.backwards
            else:
                bitext2_score = None
                bitext2_backwards = None

            score = rerank_utils.get_score(
                a, b, c, target_len,
                bitext1.rescore_score[i],
                bitext2_score,
                lm_score=lm_score,
                lenpen=lenpen,
                src_len=bitext1.source_lengths[i],
                tgt_len=bitext1.target_lengths[i],
                bitext1_backwards=bitext1.backwards,
                bitext2_backwards=bitext2_backwards,
                normalize=normalize)

            if score > best_score:
                best_score = score
                best_hypo = bitext1.rescore_hypo[i]
                best_lm = lm_score
                best_fw = bitext1.rescore_score[i]
                best_bw = bitext2_score

            if j == gen_output.num_hypos[i] or j == args.num_rescore:
                j = 1
                hypo_lst.append(best_hypo)
                score_lst.append(best_score)
                lm_lst.append(best_lm)
                fw_lst.append(best_fw)
                bw_lst.append(best_bw)
                source_lst.append(bitext1.rescore_source[i])
                reference_lst.append(bitext1.rescore_target[i])

                best_score = -math.inf
                best_hypo = ""
                best_lm = -math.inf
            else:
                j += 1

        gen_keys = list(sorted(gen_output.no_bpe_target.keys()))

        for key in range(len(gen_keys)):
            if args.prefix_len is None:
                assert hypo_lst[key] in gen_output.no_bpe_hypo[gen_keys[key]], (
                    "pred and rescore hypo mismatch: i: " + str(key) + ", " +
                    str(hypo_lst[key]) + str(gen_keys[key]) +
                    str(gen_output.no_bpe_hypo[key]))
                sys_tok = dict.encode_line(hypo_lst[key])
                ref_tok = dict.encode_line(gen_output.no_bpe_target[gen_keys[key]])
                if args.sacrebleu:
                    scorer.add_string(gen_output.no_bpe_target[gen_keys[key]],
                                      hypo_lst[key])
                else:
                    scorer.add(ref_tok, sys_tok)
            else:
                full_hypo = rerank_utils.get_full_from_prefix(
                    hypo_lst[key], gen_output.no_bpe_hypo[gen_keys[key]])
                sys_tok = dict.encode_line(full_hypo)
                ref_tok = dict.encode_line(gen_output.no_bpe_target[gen_keys[key]])
                if args.sacrebleu:
                    scorer.add_string(gen_output.no_bpe_target[gen_keys[key]],
                                      hypo_lst[key])
                else:
                    scorer.add(ref_tok, sys_tok)

        # if only one set of hyperparameters is provided, write the
        # predictions to a file
        if write_hypos:
            # recover the original ids from n-best list generation
            for key in range(len(gen_output.no_bpe_target)):
                if args.prefix_len is None:
                    assert hypo_lst[key] in gen_output.no_bpe_hypo[gen_keys[key]], \
                        ("pred and rescore hypo mismatch:" + "i:" + str(key) +
                         str(hypo_lst[key]) + str(gen_output.no_bpe_hypo[key]))
                    ordered_hypos[gen_keys[key]] = hypo_lst[key]
                    ordered_targets[gen_keys[key]] = gen_output.no_bpe_target[
                        gen_keys[key]]
                    ordered_scores[gen_keys[key]] = score_lst[key]
                    ordered_lm[gen_keys[key]] = lm_lst[key]
                    ordered_fw[gen_keys[key]] = fw_lst[key]
                    ordered_bw[gen_keys[key]] = bw_lst[key]
                else:
                    full_hypo = rerank_utils.get_full_from_prefix(
                        hypo_lst[key], gen_output.no_bpe_hypo[gen_keys[key]])
                    ordered_hypos[gen_keys[key]] = full_hypo
                    ordered_targets[gen_keys[key]] = gen_output.no_bpe_target[
                        gen_keys[key]]
                    ordered_scores[gen_keys[key]] = score_lst[key]
                    ordered_lm[gen_keys[key]] = lm_lst[key]
                    ordered_fw[gen_keys[key]] = fw_lst[key]
                    ordered_bw[gen_keys[key]] = bw_lst[key]

    # write the hypos in the original order from n-best list generation
    if args.num_shards == (len(bitext1_lst)):
        with open(target_outfile, 'a') as t, \
                open(hypo_outfile, 'a') as h, \
                open(score_outfile, 'a') as s, \
                open(lm_outfile, 'a') as l, \
                open(fw_outfile, 'a') as f, \
                open(bw_outfile, 'a') as b:
            for key in range(len(ordered_hypos)):
                t.write(ordered_targets[key])
                h.write(ordered_hypos[key])
                s.write(str(ordered_scores[key]) + "\n")
                l.write(str(ordered_lm[key]) + "\n")
                f.write(str(ordered_fw[key]) + "\n")
                b.write(str(ordered_bw[key]) + "\n")

    print(scorer)
    res = scorer.result_string(4)
    if write_hypos:
        print(res)
    if args.sacrebleu:
        score = res.score
    else:
        score = rerank_utils.parse_bleu_scoring(res)
    return score
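# A hypothetical sketch of parsing a BLEU result string of the form produced
# by result_string(4), e.g. "BLEU4 = 32.15, 64.1/38.9/26.3/18.2 (BP=0.980, ...)".
# rerank_utils.parse_bleu_scoring presumably does something similar; this
# regex is an assumption, not the actual implementation.
import re

def parse_bleu(res):
    m = re.search(r'BLEU4 = ([\d.]+)', res)
    return float(m.group(1)) if m else None

print(parse_bleu('BLEU4 = 32.15, 64.1/38.9/26.3/18.2 (BP=0.980, ratio=0.980)'))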
def build_dictionary(filenames):
    d = dictionary.Dictionary()
    for filename in filenames:
        dictionary.Dictionary.add_file_to_dictionary(filename, d,
                                                     tokenize_line,
                                                     args.workers)
    return d
def build_dictionary(tokenizer, filenames, max_length=None):
    # max_length is accepted for interface compatibility but currently unused
    d = dictionary.Dictionary()
    for filename in filenames:
        tokenizer.add_file_to_dictionary(filename, d)
    return d
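# A hedged usage sketch: the build_dictionary variants above all follow the
# same pattern (create a Dictionary, feed it every file, return it), differing
# only in which tokenizer supplies the token counts. The filenames here are
# hypothetical.
d = build_dictionary(tokenizer, ['train.en', 'valid.en'])
print(len(d))  # vocabulary size after counting both files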