def main():
    """Command-line entry point: evaluate a system output against a reference.

    Parses the command-line options, calls ``evaluate`` on readers for the
    reference and output files and then either hands the results to
    ``mtscore`` (when ``--mtevaldir`` is given) or writes a three-column
    ``<outprefix>.summary.score`` file itself.
    """
    import os  # local import: this block did not reference ``os`` before

    parser = argparse.ArgumentParser(description="Evaluation")
    parser.add_argument('--mtevaldir', type=str, action='store', default="",
                        help="Path to MT evaluation scripts")
    parser.add_argument('--ref', type=str, action='store', required=True,
                        help='Reference file')
    parser.add_argument('--out', type=str, action='store', required=True,
                        help='Output file')
    parser.add_argument('--workdir', '-w', type=str, action='store',
                        default=".", help='Work directory')
    # -i switches case sensitivity OFF (store_false on a True default).
    parser.add_argument('-i', dest='casesensitive', action='store_false',
                        default=True,
                        help='Measure translation accuracy without regard for case')
    parser.add_argument('-a', dest='oof', action='store_true', default=False,
                        help='Out of five evaluation, considers up to four additional alternatives in system output')
    # Disabled option, kept for reference:
    #parser.add_argument('-C',dest='forcecontext',help='Force context from input, even if system-supplied context is different',action='store_true',default=False)
    parser.add_argument('-I', dest='ignoreinputmismatch', action='store_true',
                        default=False, help='Ignore input mismatch')
    args = parser.parse_args()

    (totalavgaccuracy, totalwordavgaccuracy, totalavgrecall,
     matrexsrcfile, matrextgtfile, matrexoutfile) = evaluate(
         Reader(args.ref), Reader(args.out), args.mtevaldir, args.workdir,
         args.casesensitive, args.oof, args.ignoreinputmismatch)

    # Strip only the final extension of the output file name.  Unlike
    # splitting on '.', this never yields an empty prefix for names without
    # an extension and is not confused by dots in directory names.
    outprefix = os.path.splitext(args.out)[0]

    if args.mtevaldir:
        mtscore(args.mtevaldir, matrexsrcfile, matrextgtfile, matrexoutfile,
                totalavgaccuracy, totalwordavgaccuracy, totalavgrecall,
                outprefix, args.workdir)
    else:
        header = "Accuracy Word-Accuracy Recall"
        values = (str(totalavgaccuracy) + " " + str(totalwordavgaccuracy)
                  + " " + str(totalavgrecall))
        # The context manager closes the file even if a write fails.
        with io.open(outprefix + '.summary.score', 'w') as f:
            for row in (header, values):
                f.write(row + "\n")
                log(row)
def _run_metric(script, command, description, scorefile, skipmsg, errprefix,
                parse_line):
    """Run one external scoring command and parse the score file it produced.

    If ``script`` does not exist, ``skipmsg`` is logged and the metric counts
    as handled.  Otherwise ``command`` is run through ``runcmd`` and, when
    that reports success, every line of ``scorefile`` is passed to
    ``parse_line``.

    Returns True when the metric was skipped or processed completely, False
    when the command failed or the score file could not be read or parsed.
    """
    if not os.path.exists(script):
        log(skipmsg, yellow)
        return True
    if not runcmd(command, description):
        return False
    try:
        # ``with`` closes the file even when a line fails to parse.
        with io.open(scorefile, 'r', encoding='utf-8') as f:
            for line in f:
                parse_line(line)
    except Exception as e:
        # Best-effort boundary: report the problem and let the caller go on
        # with the remaining metrics.
        log(errprefix + str(e), red)
        return False
    return True


def mtscore(mtevaldir, sourcexml, refxml, targetxml, totalavgaccuracy,
            totalwordavgaccuracy, totalavgrecall, outprefix, WORKDIR='.'):
    """Run the external MT scoring scripts found under ``mtevaldir``.

    BLEU, WER, PER, METEOR, mteval (NIST & BLEU) and TER are attempted in
    that order; a metric whose script is missing is skipped and keeps the
    score 0.  Afterwards a one-row summary is written to
    ``<WORKDIR>/<outprefix>.summary.score`` and logged.

    Parameters:
        mtevaldir: directory that contains the evaluation scripts.
        sourcexml, refxml, targetxml: file paths handed to the scripts as
            ``-s``, ``-r`` and ``-t`` (``-r``/``-h`` for tercom).
        totalavgaccuracy, totalwordavgaccuracy, totalavgrecall: values that
            are copied verbatim into the summary row.
        outprefix: prefix of every ``*.score`` file that is written or read.
        WORKDIR: directory from which the score files are read and in which
            the summary is written.

    Returns:
        True if no command failed and every score file could be parsed.
    """
    scores = {'bleu': 0, 'meteor': 0, 'nist': 0, 'ter': 0, 'wer': 0, 'per': 0}

    wer_script = mtevaldir + '/eval/WER_v01.pl'
    per_script = mtevaldir + '/eval/PER_v01.pl'
    bleu_script = mtevaldir + '/eval/bleu-1.04.pl'
    meteor_script = mtevaldir + '/meteor-0.6/meteor.pl'
    mteval_script = mtevaldir + '/mteval-v11b.pl'
    ter_script = mtevaldir + '/tercom.jar'
    perl = 'perl'
    java = 'java'

    def scorepath(suffix):
        # os.path.join keeps an absolute outprefix intact, whereas plain
        # concatenation with WORKDIR would turn it into a relative path.
        return os.path.join(WORKDIR, outprefix + suffix)

    def prefixed(key, marker, extract):
        # Build a line handler that stores and logs the score found on
        # lines starting with ``marker``.
        def parse(line):
            if line.startswith(marker):
                scores[key] = extract(line)
                log(key.upper() + " score: " + str(scores[key]), white)
        return parse

    def parse_mteval(line):
        # mteval reports NIST and BLEU at fixed columns.
        if line[0:12] == "NIST score =":
            scores['nist'] = float(line[13:21].strip())
            log("NIST score: " + str(scores['nist']), white)
        if line[21:33] == "BLEU score =":
            bleu = scores['bleu']
            bleu2 = float(line[34:40].strip())
            if bleu == 0:
                # No BLEU from the dedicated script: adopt this one.
                scores['bleu'] = bleu2
                log("BLEU score: " + str(bleu2), white)
            elif abs(bleu - bleu2) > 0.01:
                log("blue score from MTEVAL scripts differs too much: " + str(bleu) + " vs " + str(bleu2) + ", choosing highest score")
                if bleu2 > bleu:
                    scores['bleu'] = bleu2
            else:
                log("BLEU score (not stored): " + str(bleu2))

    # NOTE(review): the commands redirect into names relative to the current
    # directory while the results are read back from WORKDIR; presumably
    # runcmd executes inside WORKDIR - confirm.  The paths are also
    # interpolated into the command string unquoted.
    rts = " -r " + refxml + ' -t ' + targetxml + ' -s ' + sourcexml

    # (script, command, description, score file suffix, skip message,
    #  error prefix, line handler).  mteval must come after BLEU because it
    #  reconciles its BLEU value with the one already stored.
    metrics = [
        (bleu_script,
         perl + ' ' + bleu_script + rts + ' -ci > ' + outprefix + '.bleu.score',
         'Computing BLEU score', '.bleu.score',
         "Skipping BLEU (no script found [" + bleu_script + "])",
         "Error reading bleu.score:",
         prefixed('bleu', "BLEUr1n4,", lambda l: float(l[10:].strip()))),
        (wer_script,
         perl + ' ' + wer_script + rts + ' > ' + outprefix + '.wer.score',
         'Computing WER score', '.wer.score',
         "Skipping WER (no script found [" + wer_script + "]) ",
         "Error reading wer.score:",
         prefixed('wer', "WER score =", lambda l: float(l[12:19].strip()))),
        (per_script,
         perl + ' ' + per_script + rts + ' > ' + outprefix + '.per.score',
         'Computing PER score', '.per.score',
         "Skipping PER (no script found [" + per_script + "])",
         "Error reading per.score",
         prefixed('per', "PER score =", lambda l: float(l[12:19].strip()))),
        (meteor_script,
         perl + ' -I ' + os.path.dirname(meteor_script) + ' ' + meteor_script
         + " -s colibrita -r " + refxml + ' -t ' + targetxml
         + ' --modules "exact" > ' + outprefix + '.meteor.score',
         'Computing METEOR score', '.meteor.score',
         "Skipping METEOR (no script found [" + meteor_script + "])",
         "Error reading meteor.score:",
         prefixed('meteor', "Score:", lambda l: float(l[7:].strip()))),
        (mteval_script,
         perl + ' ' + mteval_script + rts + ' > ' + outprefix + '.mteval.score',
         'Computing NIST & BLEU scores', '.mteval.score',
         "Skipping MTEVAL (BLEU & NIST) (no script found)",
         "Error reading mteval.score: ",
         parse_mteval),
        (ter_script,
         java + ' -jar ' + ter_script + " -r " + refxml + ' -h ' + targetxml
         + ' > ' + outprefix + '.ter.score',
         'Computing TER score', '.ter.score',
         "Skipping TER (no script found)",
         "Error reading ter.score: ",
         prefixed('ter', "Total TER:",
                  lambda l: float(l[11:].strip().split(' ')[0]))),
    ]

    errors = False
    for script, command, description, suffix, skipmsg, errprefix, handler in metrics:
        # Each metric is judged on its own outcome, so one failing script
        # no longer discards the results of the scripts that follow it.
        if not _run_metric(script, command, description, scorepath(suffix),
                           skipmsg, errprefix, handler):
            errors = True

    log("SCORE SUMMARY\n===================\n")
    header = "Accuracy Word-Accuracy Recall BLEU METEOR NIST TER WER PER"
    values = " ".join(str(v) for v in (
        totalavgaccuracy, totalwordavgaccuracy, totalavgrecall,
        scores['bleu'], scores['meteor'], scores['nist'],
        scores['ter'], scores['wer'], scores['per']))
    with io.open(scorepath('.summary.score'), 'w') as f:
        for row in (header, values):
            f.write(row + "\n")
            log(row)

    return not errors
def main():
    """Command-line entry point: evaluate a system output against a reference.

    Parses the command-line options, calls ``evaluate`` on readers for the
    reference and output files and then either hands the results to
    ``mtscore`` (when ``--mtevaldir`` is given) or writes a three-column
    ``<outprefix>.summary.score`` file itself.
    """
    import os  # local import: this block did not reference ``os`` before

    parser = argparse.ArgumentParser(description="Evaluation")
    parser.add_argument('--mtevaldir', type=str, action='store', default="",
                        help="Path to MT evaluation scripts")
    parser.add_argument('--ref', type=str, action='store', required=True,
                        help='Reference file')
    parser.add_argument('--out', type=str, action='store', required=True,
                        help='Output file')
    parser.add_argument('--workdir', '-w', type=str, action='store',
                        default=".", help='Work directory')
    # -i switches case sensitivity OFF (store_false on a True default).
    parser.add_argument('-i', dest='casesensitive', action='store_false',
                        default=True,
                        help='Measure translation accuracy without regard for case')
    parser.add_argument('-a', dest='oof', action='store_true', default=False,
                        help='Out of five evaluation, considers up to four additional alternatives in system output')
    # Disabled option, kept for reference:
    #parser.add_argument('-C',dest='forcecontext',help='Force context from input, even if system-supplied context is different',action='store_true',default=False)
    parser.add_argument('-I', dest='ignoreinputmismatch', action='store_true',
                        default=False, help='Ignore input mismatch')
    args = parser.parse_args()

    (totalavgaccuracy, totalwordavgaccuracy, totalavgrecall,
     matrexsrcfile, matrextgtfile, matrexoutfile) = evaluate(
         Reader(args.ref), Reader(args.out), args.mtevaldir, args.workdir,
         args.casesensitive, args.oof, args.ignoreinputmismatch)

    # Strip only the final extension of the output file name.  Unlike
    # splitting on '.', this never yields an empty prefix for names without
    # an extension and is not confused by dots in directory names.
    outprefix = os.path.splitext(args.out)[0]

    if args.mtevaldir:
        mtscore(args.mtevaldir, matrexsrcfile, matrextgtfile, matrexoutfile,
                totalavgaccuracy, totalwordavgaccuracy, totalavgrecall,
                outprefix, args.workdir)
    else:
        header = "Accuracy Word-Accuracy Recall"
        values = (str(totalavgaccuracy) + " " + str(totalwordavgaccuracy)
                  + " " + str(totalavgrecall))
        # The context manager closes the file even if a write fails.
        with io.open(outprefix + '.summary.score', 'w') as f:
            for row in (header, values):
                f.write(row + "\n")
                log(row)
def _run_metric(script, command, description, scorefile, skipmsg, errprefix,
                parse_line):
    """Run one external scoring command and parse the score file it produced.

    If ``script`` does not exist, ``skipmsg`` is logged and the metric counts
    as handled.  Otherwise ``command`` is run through ``runcmd`` and, when
    that reports success, every line of ``scorefile`` is passed to
    ``parse_line``.

    Returns True when the metric was skipped or processed completely, False
    when the command failed or the score file could not be read or parsed.
    """
    if not os.path.exists(script):
        log(skipmsg, yellow)
        return True
    if not runcmd(command, description):
        return False
    try:
        # ``with`` closes the file even when a line fails to parse.
        with io.open(scorefile, 'r', encoding='utf-8') as f:
            for line in f:
                parse_line(line)
    except Exception as e:
        # Best-effort boundary: report the problem and let the caller go on
        # with the remaining metrics.
        log(errprefix + str(e), red)
        return False
    return True


def mtscore(mtevaldir, sourcexml, refxml, targetxml, totalavgaccuracy,
            totalwordavgaccuracy, totalavgrecall, outprefix, WORKDIR='.'):
    """Run the external MT scoring scripts found under ``mtevaldir``.

    BLEU, WER, PER, METEOR, mteval (NIST & BLEU) and TER are attempted in
    that order; a metric whose script is missing is skipped and keeps the
    score 0.  Afterwards a one-row summary is written to
    ``<WORKDIR>/<outprefix>.summary.score`` and logged.

    Parameters:
        mtevaldir: directory that contains the evaluation scripts.
        sourcexml, refxml, targetxml: file paths handed to the scripts as
            ``-s``, ``-r`` and ``-t`` (``-r``/``-h`` for tercom).
        totalavgaccuracy, totalwordavgaccuracy, totalavgrecall: values that
            are copied verbatim into the summary row.
        outprefix: prefix of every ``*.score`` file that is written or read.
        WORKDIR: directory from which the score files are read and in which
            the summary is written.

    Returns:
        True if no command failed and every score file could be parsed.
    """
    scores = {'bleu': 0, 'meteor': 0, 'nist': 0, 'ter': 0, 'wer': 0, 'per': 0}

    wer_script = mtevaldir + '/eval/WER_v01.pl'
    per_script = mtevaldir + '/eval/PER_v01.pl'
    bleu_script = mtevaldir + '/eval/bleu-1.04.pl'
    meteor_script = mtevaldir + '/meteor-0.6/meteor.pl'
    mteval_script = mtevaldir + '/mteval-v11b.pl'
    ter_script = mtevaldir + '/tercom.jar'
    perl = 'perl'
    java = 'java'

    def scorepath(suffix):
        # os.path.join keeps an absolute outprefix intact, whereas plain
        # concatenation with WORKDIR would turn it into a relative path.
        return os.path.join(WORKDIR, outprefix + suffix)

    def prefixed(key, marker, extract):
        # Build a line handler that stores and logs the score found on
        # lines starting with ``marker``.
        def parse(line):
            if line.startswith(marker):
                scores[key] = extract(line)
                log(key.upper() + " score: " + str(scores[key]), white)
        return parse

    def parse_mteval(line):
        # mteval reports NIST and BLEU at fixed columns.
        if line[0:12] == "NIST score =":
            scores['nist'] = float(line[13:21].strip())
            log("NIST score: " + str(scores['nist']), white)
        if line[21:33] == "BLEU score =":
            bleu = scores['bleu']
            bleu2 = float(line[34:40].strip())
            if bleu == 0:
                # No BLEU from the dedicated script: adopt this one.
                scores['bleu'] = bleu2
                log("BLEU score: " + str(bleu2), white)
            elif abs(bleu - bleu2) > 0.01:
                log("blue score from MTEVAL scripts differs too much: " + str(bleu) + " vs " + str(bleu2) + ", choosing highest score")
                if bleu2 > bleu:
                    scores['bleu'] = bleu2
            else:
                log("BLEU score (not stored): " + str(bleu2))

    # NOTE(review): the commands redirect into names relative to the current
    # directory while the results are read back from WORKDIR; presumably
    # runcmd executes inside WORKDIR - confirm.  The paths are also
    # interpolated into the command string unquoted.
    rts = " -r " + refxml + ' -t ' + targetxml + ' -s ' + sourcexml

    # (script, command, description, score file suffix, skip message,
    #  error prefix, line handler).  mteval must come after BLEU because it
    #  reconciles its BLEU value with the one already stored.
    metrics = [
        (bleu_script,
         perl + ' ' + bleu_script + rts + ' -ci > ' + outprefix + '.bleu.score',
         'Computing BLEU score', '.bleu.score',
         "Skipping BLEU (no script found [" + bleu_script + "])",
         "Error reading bleu.score:",
         prefixed('bleu', "BLEUr1n4,", lambda l: float(l[10:].strip()))),
        (wer_script,
         perl + ' ' + wer_script + rts + ' > ' + outprefix + '.wer.score',
         'Computing WER score', '.wer.score',
         "Skipping WER (no script found [" + wer_script + "]) ",
         "Error reading wer.score:",
         prefixed('wer', "WER score =", lambda l: float(l[12:19].strip()))),
        (per_script,
         perl + ' ' + per_script + rts + ' > ' + outprefix + '.per.score',
         'Computing PER score', '.per.score',
         "Skipping PER (no script found [" + per_script + "])",
         "Error reading per.score",
         prefixed('per', "PER score =", lambda l: float(l[12:19].strip()))),
        (meteor_script,
         perl + ' -I ' + os.path.dirname(meteor_script) + ' ' + meteor_script
         + " -s colibrita -r " + refxml + ' -t ' + targetxml
         + ' --modules "exact" > ' + outprefix + '.meteor.score',
         'Computing METEOR score', '.meteor.score',
         "Skipping METEOR (no script found [" + meteor_script + "])",
         "Error reading meteor.score:",
         prefixed('meteor', "Score:", lambda l: float(l[7:].strip()))),
        (mteval_script,
         perl + ' ' + mteval_script + rts + ' > ' + outprefix + '.mteval.score',
         'Computing NIST & BLEU scores', '.mteval.score',
         "Skipping MTEVAL (BLEU & NIST) (no script found)",
         "Error reading mteval.score: ",
         parse_mteval),
        (ter_script,
         java + ' -jar ' + ter_script + " -r " + refxml + ' -h ' + targetxml
         + ' > ' + outprefix + '.ter.score',
         'Computing TER score', '.ter.score',
         "Skipping TER (no script found)",
         "Error reading ter.score: ",
         prefixed('ter', "Total TER:",
                  lambda l: float(l[11:].strip().split(' ')[0]))),
    ]

    errors = False
    for script, command, description, suffix, skipmsg, errprefix, handler in metrics:
        # Each metric is judged on its own outcome, so one failing script
        # no longer discards the results of the scripts that follow it.
        if not _run_metric(script, command, description, scorepath(suffix),
                           skipmsg, errprefix, handler):
            errors = True

    log("SCORE SUMMARY\n===================\n")
    header = "Accuracy Word-Accuracy Recall BLEU METEOR NIST TER WER PER"
    values = " ".join(str(v) for v in (
        totalavgaccuracy, totalwordavgaccuracy, totalavgrecall,
        scores['bleu'], scores['meteor'], scores['nist'],
        scores['ter'], scores['wer'], scores['per']))
    with io.open(scorepath('.summary.score'), 'w') as f:
        for row in (header, values):
            f.write(row + "\n")
            log(row)

    return not errors