def compute_bleu(net, word_dict, index_dict, tokens, initial=None, IM=None):
    """
    Return BLEU scores for reference tokens

    For each reference caption, a candidate caption is sampled from net
    and scored against that single reference with n = 1, 2 and 3.

    net        -- model handed to sample(); net.context is the number of
                  leading tokens dropped from every reference
    word_dict  -- passed through to sample() unchanged
    index_dict -- passed through to sample() unchanged
    tokens     -- sequence of reference token sequences
    initial    -- optional initial state for sample(); a fresh deep copy
                  is made for every caption
    IM         -- optional; when given, IM[i] is passed to sample() for
                  caption i (presumably per-caption image features --
                  TODO confirm)

    Returns an array of shape (len(tokens), 3); column j holds the score
    computed with n = j + 1.
    """
    bleu_scores = np.zeros((len(tokens), 3))
    for i, ref in enumerate(tokens):
        # Identity tests on purpose: "!= None" on a numpy array compares
        # element-wise and cannot be used as an if-condition.
        init = copy.deepcopy(initial) if initial is not None else None

        # Drop the first net.context tokens and the final token.
        ref = ref[net.context:][:-1]

        if IM is not None:
            can = sample(net, word_dict, index_dict, len(ref), IM[i],
                         initial=init)
        else:
            can = sample(net, word_dict, index_dict, len(ref), initial=init)

        # Compute bleu using n = (1,2,3)
        row = []
        for n in (1, 2, 3):
            cooked = bleu.cook_test(can, bleu.cook_refs([ref], n=n), n=n)
            row.append(bleu.score_cooked([cooked], n=n))
        bleu_scores[i] = row
    return bleu_scores
def main():
    """
    Smoke-test the cmert bleu module on the data in test_scorer_data.

    Reads reference.txt (one reference per line) and nbest.out (lines of
    the form "<sentence id> ||| <text> ..."), scores the first hypothesis
    of every sentence, prints the per-sentence statistics, then rescores
    with a different hypothesis for sentences 7 and 1 and prints both
    corpus scores.
    """
    sys.path.append("../scripts/training/cmert-0.5")
    import bleu

    data_dir = "test_scorer_data"
    nbest_file = os.path.join(data_dir, "nbest.out")
    ref_file = os.path.join(data_dir, "reference.txt")

    bleu.preserve_case = False
    bleu.eff_ref_len = "shortest"
    bleu.nonorm = 0

    # One cooked reference set per line of the reference file.
    with open(ref_file) as ref_fh:
        cookedrefs = [bleu.cook_refs([ref]) for ref in ref_fh]

    # Group consecutive n-best lines that share a sentence id.
    tests = []
    last_id = -1
    with open(nbest_file) as nbest_fh:
        for line in nbest_fh:
            fields = line.split("||| ")
            current_id = int(fields[0])
            text = fields[1]
            if last_id != current_id:
                tests.append([])
                last_id = current_id
            tests[-1].append(text)

    # score with first best
    cookedtests = []
    for i, candidates in enumerate(tests):
        cookedtest = bleu.cook_test(candidates[0], cookedrefs[i])
        stats = " ".join(
            "%d %d" % (c, g)
            for (c, g) in zip(cookedtest['correct'], cookedtest['guess']))
        print(" %s %d" % (stats, cookedtest['reflen']))
        cookedtests.append(cookedtest)
    bleu1 = bleu.score_cooked(cookedtests)

    # vary, and score again
    cookedtests = []
    for i, candidates in enumerate(tests):
        if i == 7:
            sentence = candidates[8]
        elif i == 1:
            sentence = candidates[2]
        else:
            sentence = candidates[0]
        cookedtests.append(bleu.cook_test(sentence, cookedrefs[i]))
    bleu2 = bleu.score_cooked(cookedtests)

    # Single-argument print keeps the output identical under the print
    # statement: "Bleus: " followed by a space and the two scores.
    print("Bleus:  %s %s" % (bleu1, bleu2))
def main():
    """
    Smoke-test the cmert bleu module on the data in test_scorer_data.

    Reads reference.txt (one reference per line) and nbest.out (lines of
    the form "<sentence id> ||| <text> ..."), scores the first hypothesis
    of every sentence, prints the per-sentence statistics, then rescores
    with a different hypothesis for sentences 7 and 1 and prints both
    corpus scores.
    """
    sys.path.append("../scripts/training/cmert-0.5")
    import bleu

    data_dir = "test_scorer_data"
    nbest_file = os.path.join(data_dir, "nbest.out")
    ref_file = os.path.join(data_dir, "reference.txt")

    bleu.preserve_case = False
    bleu.eff_ref_len = "shortest"
    bleu.nonorm = 0

    # One cooked reference set per line of the reference file.
    with open(ref_file) as ref_fh:
        cookedrefs = [bleu.cook_refs([ref]) for ref in ref_fh]

    # Group consecutive n-best lines that share a sentence id.
    tests = []
    last_id = -1
    with open(nbest_file) as nbest_fh:
        for line in nbest_fh:
            fields = line.split("||| ")
            current_id = int(fields[0])
            text = fields[1]
            if last_id != current_id:
                tests.append([])
                last_id = current_id
            tests[-1].append(text)

    # score with first best
    cookedtests = []
    for i, candidates in enumerate(tests):
        cookedtest = bleu.cook_test(candidates[0], cookedrefs[i])
        stats = " ".join(
            "%d %d" % (c, g)
            for (c, g) in zip(cookedtest['correct'], cookedtest['guess']))
        print(" %s %d" % (stats, cookedtest['reflen']))
        cookedtests.append(cookedtest)
    bleu1 = bleu.score_cooked(cookedtests)

    # vary, and score again
    cookedtests = []
    for i, candidates in enumerate(tests):
        if i == 7:
            sentence = candidates[8]
        elif i == 1:
            sentence = candidates[2]
        else:
            sentence = candidates[0]
        cookedtests.append(bleu.cook_test(sentence, cookedrefs[i]))
    bleu2 = bleu.score_cooked(cookedtests)

    # Single-argument print keeps the output identical under the print
    # statement: "Bleus: " followed by a space and the two scores.
    print("Bleus:  %s %s" % (bleu1, bleu2))
def batch_bleu(cans, refs):
    """
    Score each candidate against its own list of references.

    cans : [ 'XXX', 'XXX', ... ]
    refs : [ ['XXX', 'XXX', ... ], ['XXX', 'XXX', ... ], ... ]

    Returns an array of shape (len(cans), 3); row i holds the scores of
    cans[i] against refs[i] computed with n = 1, 2 and 3.
    """
    scores = np.zeros((len(cans), 3))
    for idx, candidate in enumerate(cans):
        row = []
        for order in (1, 2, 3):
            cooked_refs = bleu.cook_refs(refs[idx], n=order)
            cooked_test = bleu.cook_test(candidate, cooked_refs, n=order)
            row.append(bleu.score_cooked([cooked_test], n=order))
        scores[idx] = row
    return scores
def bleu_single(test, cookedrefs, n=4, addprec=1):
    """
    Return a smoothed score for one test sentence.

    The sentence is cooked against cookedrefs with bleu.cook_test; the
    result is the geometric mean over orders 1..n of
    (correct + addprec) / (guess + addprec), multiplied by
    exp(1 - reflen / testlen) when the test is non-empty and shorter
    than the reference length.
    """
    stats = bleu.cook_test(test, cookedrefs, n=n)

    score = 1.
    for order in range(n):
        matched = float(stats['correct'][order] + addprec)
        proposed = stats['guess'][order] + addprec
        score *= matched / proposed
    score = score ** (1. / n)

    # Length penalty, applied only to non-empty tests that fall short of
    # the reference length.
    hyp_len = stats['testlen']
    if hyp_len > 0:
        ref_len = stats['reflen']
        if hyp_len < ref_len:
            score *= math.exp(1 - float(ref_len) / hyp_len)
    return score
def process(sentnum, testsents):
    """
    Write the candidate and feature records for one sentence.

    A header line "<sentence number> <candidate count>" goes to
    candsfile; then, for every (sent, vector) pair in testsents, one line
    goes to featsfile holding the vector, the correct/guess counts from
    bleu.cook_test against cookedrefs[sentnum], and the reference length.
    """
    # NOTE(review): the header uses the module-level cur_sentnum, not the
    # sentnum argument -- confirm callers always pass cur_sentnum.
    header = "%d %d\n" % (cur_sentnum, len(testsents))
    candsfile.write(header)

    for sent, vector in testsents:
        stats = bleu.cook_test(sent, cookedrefs[sentnum])
        if stats['testlen'] != stats['guess'][0]:
            sys.stderr.write("ERROR: test length != guessed 1-grams\n")

        feature_text = " ".join(map(str, vector))
        count_pairs = zip(stats['correct'], stats['guess'])
        count_text = " ".join(["%d %d" % pair for pair in count_pairs])
        featsfile.write(
            "%s %s %d\n" % (feature_text, count_text, stats['reflen']))
def bleu_single(test, cookedrefs, n=4, addprec=1):
    """
    Return a smoothed score for one test sentence.

    The sentence is cooked against cookedrefs with bleu.cook_test; the
    result is the geometric mean over orders 1..n of
    (correct + addprec) / (guess + addprec), multiplied by
    exp(1 - reflen / testlen) when the test is non-empty and shorter
    than the reference length.
    """
    stats = bleu.cook_test(test, cookedrefs, n=n)

    score = 1.
    for order in range(n):
        matched = float(stats['correct'][order] + addprec)
        proposed = stats['guess'][order] + addprec
        score *= matched / proposed
    score = score ** (1. / n)

    # Length penalty, applied only to non-empty tests that fall short of
    # the reference length.
    hyp_len = stats['testlen']
    if hyp_len > 0:
        ref_len = stats['reflen']
        if hyp_len < ref_len:
            score *= math.exp(1 - float(ref_len) / hyp_len)
    return score
def process(sentnum, testsents):
    """
    Write the candidate and feature records for one sentence.

    A header line "<sentence number> <candidate count>" goes to
    candsfile; then, for every (sent, vector) pair in testsents, one line
    goes to featsfile holding the vector, the correct/guess counts from
    bleu.cook_test against cookedrefs[sentnum], and the reference length.
    """
    # NOTE(review): the header uses the module-level cur_sentnum, not the
    # sentnum argument -- confirm callers always pass cur_sentnum.
    header = "%d %d\n" % (cur_sentnum, len(testsents))
    candsfile.write(header)

    for sent, vector in testsents:
        stats = bleu.cook_test(sent, cookedrefs[sentnum])
        if stats['testlen'] != stats['guess'][0]:
            sys.stderr.write("ERROR: test length != guessed 1-grams\n")

        feature_text = " ".join(map(str, vector))
        count_pairs = zip(stats['correct'], stats['guess'])
        count_text = " ".join(["%d %d" % pair for pair in count_pairs])
        featsfile.write(
            "%s %s %d\n" % (feature_text, count_text, stats['reflen']))
def compute_bleu(net, word_dict, index_dict, tokens, initial=None, IM=None):
    """
    Return BLEU scores for reference tokens

    For each reference caption, a candidate caption is sampled from net
    and scored against that single reference with n = 1, 2 and 3.

    net        -- model handed to sample(); net.context is the number of
                  leading tokens dropped from every reference
    word_dict  -- passed through to sample() unchanged
    index_dict -- passed through to sample() unchanged
    tokens     -- sequence of reference token sequences
    initial    -- optional initial state for sample(); a fresh deep copy
                  is made for every caption
    IM         -- optional; when given, IM[i] is passed to sample() for
                  caption i (presumably per-caption image features --
                  TODO confirm)

    Returns an array of shape (len(tokens), 3); column j holds the score
    computed with n = j + 1.
    """
    bleu_scores = np.zeros((len(tokens), 3))
    for i, ref in enumerate(tokens):
        # Identity tests on purpose: "!= None" on a numpy array compares
        # element-wise and cannot be used as an if-condition.
        init = copy.deepcopy(initial) if initial is not None else None

        # Drop the first net.context tokens and the final token.
        ref = ref[net.context:][:-1]

        if IM is not None:
            can = sample(net, word_dict, index_dict, len(ref), IM[i],
                         initial=init)
        else:
            can = sample(net, word_dict, index_dict, len(ref), initial=init)

        # Compute bleu using n = (1,2,3)
        row = []
        for n in (1, 2, 3):
            cooked = bleu.cook_test(can, bleu.cook_refs([ref], n=n), n=n)
            row.append(bleu.score_cooked([cooked], n=n))
        bleu_scores[i] = row
    return bleu_scores
# Cook the references: every argument after the first names a reference
# file, and the files are read in lockstep so that `lines` holds the same
# line number from each of them.
for lines in itertools.izip(*[file(filename) for filename in args[1:]]):
    cookedrefs.append(bleu.cook_refs(lines, n=n))

# linemap[j] is the index into cookedrefs used for test line j: read from
# the map file when one was given, otherwise the identity mapping.
if opts.mapfilename is None:
    linemap = range(len(cookedrefs))
else:
    linemap = [int(line) for line in file(opts.mapfilename)]

# "-" selects standard input as the test file.
infile = sys.stdin if args[0] == "-" else open(args[0])

test1 = [bleu.cook_test(line, cookedrefs[i], n=n)
         for (line, i) in itertools.izip(infile, linemap)]

# Emit one score per test line on stdout and the mean on stderr.
total = 0.
n_sent = 0
for n_sent, comps in enumerate(test1, 1):
    score = score_single_cooked(comps)
    sys.stdout.write("bleu+1=%f\n" % score)
    total += score

# NOTE(review): when no test line was scored n_sent is 0 and this
# division raises ZeroDivisionError.
sys.stderr.write("average: %s\n" % (total/n_sent))
# Highest n-gram order; passed to cook_refs/cook_test and used as the
# number of precisions averaged below.
n = 4

# Cook the references: every argument after the first names a reference
# file. The files are read in lockstep, so `lines` holds the same line
# number from each file; each line is split on whitespace before cooking.
cookedrefs = []
for lines in itertools.izip(*[file(filename) for filename in args[1:]]):
    cookedrefs.append(bleu.cook_refs([line.split() for line in lines], n=n))

# linemap[j] is the index into cookedrefs used for test line j: one
# integer per line of the map file when given, otherwise 0..len-1.
if opts.mapfilename is not None:
    linemap = []
    for line in file(opts.mapfilename):
        linemap.append(int(line))
else:
    linemap = range(len(cookedrefs))

# Cook every test line (first argument) against its mapped references.
test1 = []
for (line,i) in itertools.izip(file(args[0]), linemap):
    test1.append(bleu.cook_test(line.split(), cookedrefs[i], n=n))

total = 0.
n_sent = 0
for comps in test1:
    # An empty test line is reported as 0 and takes no further part.
    if comps['testlen'] == 0:
        sys.stdout.write("0\n")
        continue
    # Sum of log((correct + 1) / (guess + 1)) over the n orders, i.e. the
    # log of the add-one smoothed n-gram precisions.
    logbleu = 0.0
    for k in xrange(n):
        logbleu += math.log(comps['correct'][k]+1)-math.log(comps['guess'][k]+1)
        #sys.stdout.write("%d/%d " % (comps['correct'][k], comps['guess'][k]))
    # Average over the orders (log of the geometric mean).
    logbleu /= float(n)
    # NOTE(review): the body of this branch lies outside the visible span.
    if opts.brevitypenalty:
# Convert every n-best line into one hypothesis line (hypfile) and one
# tuning line (tunefile) of the form
#   <sentence index> ||| <correct/guess counts> <reflen> ||| <features> <model score>
for line in infile:
    prefeats = parse_nbest(line.strip())
    # Features absent from the line read as the string "0"
    # (dd is presumably collections.defaultdict -- TODO confirm).
    feats = dd(lambda: "0")
    feats.update(prefeats)
    hyp = feats[hypkey].lstrip("{").rstrip("}")
    # Sentence ids in the input are shifted down by one before indexing
    # cookedrefs.
    sent = int(feats[sentkey]) - 1

    # write hyp to temp file
    hypfile.write("%s\n" % hyp)

    # write id, components, features to tuning file
    tunefile.write("%d ||| " % sent)

    # convert hyp to components using bleu stuff
    cook = bleu.cook_test(hyp.split(), cookedrefs[sent], n=bleun)
    for k in range(bleun):
        tunefile.write("%d " % cook["correct"][k])
        tunefile.write("%d " % cook["guess"][k])
    tunefile.write("%d ||| " % cook["reflen"])

    # pull out tuned features (written with their sign flipped)
    for feat in args.feats:
        tunefile.write("%s " % -float(feats[feat]))

    # form model feature from untuned features: the negated, weighted sum
    # of every feature that has an entry in modelweights
    modelscore = 0.0
    for fname, fval in feats.iteritems():
        if fname not in modelweights:
            continue
        modelscore -= float(fval) * modelweights[fname]
    tunefile.write("%f\n" % modelscore)

hypfile.close()
# Parse the command-line flags. "-r" is accepted by the option string but
# is not handled anywhere in the visible span.
(opts, args) = getopt.getopt(sys.argv[1:], "rctpv", [])
for (opt, parm) in opts:
    if opt == "-c":
        bleu.preserve_case = True
    elif opt == "-t":
        bleu.nist_tokenize = False
    elif opt == "-p":
        bleu.clip_len = True
    elif opt == "-v":
        verbose = True

# The argument files are read in lockstep: the first is system 1's
# output, the second is system 2's output and the rest are references.
test1 = []
test2 = []
for lines in itertools.izip(*[file(filename) for filename in args]):
    cookedrefs = bleu.cook_refs(lines[2:])
    test1.append(bleu.cook_test(lines[0], cookedrefs))
    test2.append(bleu.cook_test(lines[1], cookedrefs))

score1 = bleu.score_cooked(test1)

print "System 1: %f" % score1
print "System 2: %f" % bleu.score_cooked(test2)

# Starting from a copy of system 1's cooked sentences, put system 2's
# sentence i in its place, rescore the whole set and count whether the
# score went up or down relative to system 1.
better = worse = 0
fake = test1[:]
for i in xrange(len(fake)):
    fake[i] = test2[i]
    fake_score = bleu.score_cooked(fake)
    if fake_score > score1:
        better += 1
    # NOTE(review): the body of this branch, and whether fake[i] is
    # restored afterwards, lies outside the visible span.
    elif fake_score < score1:
# Read "sentnum ||| sentence ||| feature vector" lines one at a time and
# echo each as "sentnum ||| <correct/guess counts> <reflen> ||| vector".
line = infile.readline()
while line != "":
    try:
        (sentnum, sent, vector) = line.split('|||')
    except ValueError:
        # The line did not split into exactly three fields.
        sys.stderr.write("ERROR: bad input line: %s\n" % line)
        # Move on to the next line before restarting the loop; without
        # this the same malformed line would be re-examined forever.
        line = infile.readline()
        continue
    sentnum = int(sentnum)
    # Collapse runs of whitespace in the sentence to single spaces.
    sent = " ".join(sent.split())
    vector = vector.strip()

    # Never runs: the leading "False and" short-circuits the test.
    if False and sent == "":
        progress += 1
        line = infile.readline()
        continue

    comps = bleu.cook_test(sent, cookedrefs[sentnum])
    if comps['testlen'] != comps['guess'][0]:
        sys.stderr.write("ERROR: test length != guessed 1-grams\n")
    counts = " ".join(["%d %d" % (c, g)
                       for (c, g) in zip(comps['correct'], comps['guess'])])
    sys.stdout.write("%d ||| %s %d ||| %s\n" %
                     (sentnum, counts, comps['reflen'], vector))
    sys.stdout.flush()

    # One progress dot on stderr each time the sentence number changes.
    if sentnum != cur_sentnum:
        sys.stderr.write(".")
        sys.stderr.flush()
        cur_sentnum = sentnum
# Convert every n-best line into one hypothesis line (hypfile) and one
# tuning line (tunefile) of the form
#   <sentence index> ||| <correct/guess counts> <reflen> ||| <features> <model score>
for line in infile:
    prefeats = parse_nbest(line.strip())
    # Features absent from the line read as the string "0"
    # (dd is presumably collections.defaultdict -- TODO confirm).
    feats = dd(lambda: "0")
    feats.update(prefeats)
    hyp = feats[hypkey].lstrip("{").rstrip("}")
    # Sentence ids in the input are shifted down by one before indexing
    # cookedrefs.
    sent = int(feats[sentkey]) - 1

    # write hyp to temp file
    hypfile.write("%s\n" % hyp)

    # write id, components, features to tuning file
    tunefile.write("%d ||| " % sent)

    # convert hyp to components using bleu stuff
    cook = bleu.cook_test(hyp.split(), cookedrefs[sent], n=bleun)
    for k in range(bleun):
        tunefile.write("%d " % cook["correct"][k])
        tunefile.write("%d " % cook["guess"][k])
    tunefile.write("%d ||| " % cook["reflen"])

    # pull out tuned features (written with their sign flipped)
    for feat in args.feats:
        tunefile.write("%s " % -float(feats[feat]))

    # form model feature from untuned features: the negated, weighted sum
    # of every feature that has an entry in modelweights
    modelscore = 0.0
    for fname, fval in feats.iteritems():
        if fname not in modelweights:
            continue
        modelscore -= float(fval) * modelweights[fname]
    tunefile.write("%f\n" % modelscore)

hypfile.close()