parser.add_argument('--case-insensitive', '-i', action='store_true')

if __name__ == '__main__':
    args = parser.parse_args()

    # When no metric flag is given, default to computing all of them.
    if not any([args.all, args.wer, args.ter, args.bleu, args.pyter]):
        args.all = True
    if args.all:
        args.wer = args.ter = args.bleu = True

    # Read hypothesis (source) and reference (target) sentences, one per line,
    # optionally lower-casing both sides for case-insensitive scoring.
    with open(args.source) as src_file, open(args.target) as trg_file:
        if args.case_insensitive:
            hypotheses = [line.strip().lower() for line in src_file]
            references = [line.strip().lower() for line in trg_file]
        else:
            hypotheses = [line.strip() for line in src_file]
            references = [line.strip() for line in trg_file]

    # Compute each requested metric; OrderedDict keeps the printing order stable.
    scores = OrderedDict()
    if args.bleu:
        scores['bleu'], _ = corpus_bleu(hypotheses, references)
    if args.wer:
        scores['wer'], _ = corpus_wer(hypotheses, references)
    if args.ter:
        scores['ter'], _ = corpus_tercom(hypotheses, references)
    if args.pyter:
        scores['pyter'], _ = corpus_ter(hypotheses, references)

    print(' '.join('{}={:.2f}'.format(k, v) for k, v in scores.items()))
references = [remove_punk(line) for line in references] if args.max_size is not None: hypotheses = hypotheses[:args.max_size] references = references[:args.max_size] if len(hypotheses) != len(references): sys.stderr.write( 'warning: source and target don\'t have the same length\n') size = min(len(hypotheses), len(references)) hypotheses = hypotheses[:size] references = references[:size] scores = OrderedDict() if args.bleu: scores['bleu'], summary = corpus_bleu(hypotheses, references) try: scores['penalty'], scores['ratio'] = map( float, re.findall('\w+=(\d+.\d+)', summary)) except ValueError: pass if args.wer: scores['wer'], _ = corpus_wer(hypotheses, references) if args.ter: try: # java missing scores['ter'], _ = corpus_ter(hypotheses, references, tercom_path=tercom_path) except: scores['ter'] = 0 if args.cer:
# Bootstrap confidence interval for corpus BLEU: resample sentence pairs
# with replacement `args.draws` times and report the central interval after
# trimming a fraction `args.p` of the extreme draws.
if len(hypotheses) != len(references):
    sys.stderr.write(
        'warning: source and target don\'t have the same length\n')
    size = min(len(hypotheses), len(references))
    hypotheses = hypotheses[:size]
    references = references[:size]

# sample_size == 0 means "use the full corpus size" for each draw.
if args.sample_size == 0:
    args.sample_size = len(hypotheses)

bleu_scores = []
# numpy arrays allow fancy indexing with the sampled index vector.
hypotheses = np.array(hypotheses)
references = np.array(references)

for _ in range(args.draws):
    # Sample with replacement (this is what makes it a bootstrap).
    indices = np.random.randint(len(hypotheses), size=args.sample_size)
    hypotheses_ = hypotheses[indices]
    references_ = references[indices]
    bleu, _ = corpus_bleu(hypotheses_, references_)
    bleu_scores.append(bleu)

bleu_scores = sorted(bleu_scores)
# Trim p/2 of the draws at each end to keep the central (1 - p) interval.
k = int(len(bleu_scores) * args.p) // 2  # FIXME
bleu_scores = bleu_scores[k:len(bleu_scores) - k]

print('[{:.3f}, {:.3f}]'.format(bleu_scores[0], bleu_scores[-1]))
# Plot BLEU as a function of source-sentence length for one or more MT
# outputs, either as grouped bars or as line plots.
if args.src is None:
    args.src = args.ref

assert args.labels is None or len(args.labels) == len(args.mt)

for k, mt in enumerate(args.mt):
    with open(args.src) as src_file, open(mt) as mt_file, open(args.ref) as ref_file:
        lines = list(zip(src_file, mt_file, ref_file))

    bins = OrderedDict()
    # Bucket sentence triples by source length in steps of `args.step`
    # and score each non-empty bucket; `mt_` avoids shadowing the current
    # MT file path held by `mt`.
    for i in range(args.min, args.max, args.step):
        lines_ = [(mt_.strip(), ref.strip()) for src, mt_, ref in lines
                  if i < len(src.split()) <= i + args.step]
        if len(lines_) > 0:
            score, summary = corpus_bleu(*zip(*lines_))
            bins[i + args.step] = score

    values = np.array(list(bins.values()))
    keys = np.array(list(bins.keys()))
    label = args.labels[k] if args.labels else None

    if args.bar:
        # Side-by-side bars: each system is shifted right by its index k.
        # (The original shifted by k twice — `keys += k` and `keys + k` —
        # placing system k at offset 2k; apply the offset exactly once.)
        width = 1 if len(args.mt) > 1 else args.step - 1
        plt.bar(keys + k, values, width=width, label=label)
    else:
        plt.plot(keys, values, label=label)