def compute_bleu(net, word_dict, index_dict, tokens, initial=None, IM=None):
    """
    Return BLEU scores for reference tokens

    For each reference caption, a candidate caption is sampled from net
    and scored against that single reference.

    net        -- model handed to sample(); the first net.context tokens
                  of every reference are dropped before scoring
    word_dict  -- passed through to sample() unchanged
    index_dict -- passed through to sample() unchanged
    tokens     -- sequence of reference token sequences
    initial    -- optional initial state for sample(); a fresh deep copy
                  is made for every reference
    IM         -- optional indexable of per-reference inputs; IM[i] is
                  passed to sample() for the i-th reference
                  (presumably image features -- confirm against caller)

    Returns an array of shape (len(tokens), 3) whose columns hold the
    BLEU score computed with n = 1, 2 and 3.
    """
    bleu_scores = np.zeros((len(tokens), 3))
    for i, ref in enumerate(tokens):
        # Identity test, not "!= None": an equality test against a numpy
        # array is evaluated elementwise and cannot be used as a condition.
        if initial is not None:
            init = copy.deepcopy(initial)
        else:
            init = None

        # Drop the leading context window and the final token
        # (presumably an end-of-sentence marker -- confirm).
        ref = ref[net.context:][:-1]

        if IM is not None:
            can = sample(net, word_dict, index_dict, len(ref), IM[i],
                         initial=init)
        else:
            can = sample(net, word_dict, index_dict, len(ref), initial=init)

        # Compute bleu using n = (1,2,3)
        bleu_scores[i] = [
            bleu.score_cooked(
                [bleu.cook_test(can, bleu.cook_refs([ref], n=n), n=n)], n=n)
            for n in (1, 2, 3)
        ]
    return bleu_scores
def main():
    """
    Exercise the cmert-0.5 BLEU scorer on the files in test_scorer_data.

    Reads reference.txt (one reference per line) and nbest.out (lines whose
    fields are separated by "||| ": an integer sentence index followed by
    the hypothesis text), then:

      1. scores the first hypothesis of every sentence, printing the
         per-sentence correct/guess counts and reference length;
      2. swaps in a different hypothesis for sentences 7 and 1 and scores
         the whole set again;
      3. prints both corpus-level scores.

    Takes no arguments and returns nothing; all results go to stdout.
    """
    sys.path.append("../scripts/training/cmert-0.5")
    import bleu

    data_dir = "test_scorer_data"
    nbest_file = os.path.join(data_dir, "nbest.out")
    ref_file = os.path.join(data_dir, "reference.txt")

    bleu.preserve_case = False
    bleu.eff_ref_len = "shortest"
    bleu.nonorm = 0

    # "with" guarantees the handles are closed even if cooking/parsing fails.
    with open(ref_file) as ref_fh:
        cookedrefs = [bleu.cook_refs([ref]) for ref in ref_fh]

    # Group consecutive n-best lines that share a sentence index.
    tests = []
    i = -1
    with open(nbest_file) as nbest_fh:
        for line in nbest_fh:
            fields = line.split("||| ")
            current_i = int(fields[0])
            text = fields[1]
            if i != current_i:
                tests.append([])
                i = current_i
            tests[-1].append(text)

    # score with first best
    cookedtests = []
    for i, hyps in enumerate(tests):
        cookedtest = bleu.cook_test(hyps[0], cookedrefs[i])
        stats = " ".join(
            "%d %d" % (c, g)
            for (c, g) in zip(cookedtest['correct'], cookedtest['guess']))
        # Single-argument print(...) produces the same text on Python 2 and 3.
        print(" %s %d" % (stats, cookedtest['reflen']))
        cookedtests.append(cookedtest)
    bleu1 = bleu.score_cooked(cookedtests)

    # vary, and score again
    cookedtests = []
    for i, hyps in enumerate(tests):
        sentence = hyps[0]
        if i == 7:
            sentence = hyps[8]
        elif i == 1:
            # Previously assigned to a misspelled name ("sentences"), so the
            # alternative hypothesis for sentence 1 was never actually used.
            sentence = hyps[2]
        cookedtests.append(bleu.cook_test(sentence, cookedrefs[i]))
    bleu2 = bleu.score_cooked(cookedtests)

    # Same output as the old comma-separated print statement.
    print("Bleus:  %s %s" % (bleu1, bleu2))
def main():
    """
    Exercise the cmert-0.5 BLEU scorer on the files in test_scorer_data.

    Reads reference.txt (one reference per line) and nbest.out (lines whose
    fields are separated by "||| ": an integer sentence index followed by
    the hypothesis text), scores the first-best hypotheses, then re-scores
    after substituting another hypothesis for sentences 7 and 1, and prints
    both corpus-level scores.

    Takes no arguments and returns nothing; all results go to stdout.
    """
    sys.path.append("../scripts/training/cmert-0.5")
    import bleu

    data_dir = "test_scorer_data"
    nbest_file = os.path.join(data_dir, "nbest.out")
    ref_file = os.path.join(data_dir, "reference.txt")

    bleu.preserve_case = False
    bleu.eff_ref_len = "shortest"
    bleu.nonorm = 0

    # Context managers close the files even when an exception is raised.
    with open(ref_file) as ref_fh:
        cookedrefs = [bleu.cook_refs([ref]) for ref in ref_fh]

    # Consecutive lines with the same leading index belong to one sentence.
    tests = []
    i = -1
    with open(nbest_file) as nbest_fh:
        for line in nbest_fh:
            fields = line.split("||| ")
            current_i = int(fields[0])
            text = fields[1]
            if i != current_i:
                tests.append([])
                i = current_i
            tests[-1].append(text)

    # score with first best
    cookedtests = []
    for i, hyps in enumerate(tests):
        cookedtest = bleu.cook_test(hyps[0], cookedrefs[i])
        stats = " ".join(
            "%d %d" % (c, g)
            for (c, g) in zip(cookedtest['correct'], cookedtest['guess']))
        # print(...) with one argument behaves identically on Python 2 and 3.
        print(" %s %d" % (stats, cookedtest['reflen']))
        cookedtests.append(cookedtest)
    bleu1 = bleu.score_cooked(cookedtests)

    # vary, and score again
    cookedtests = []
    for i, hyps in enumerate(tests):
        sentence = hyps[0]
        if i == 7:
            sentence = hyps[8]
        elif i == 1:
            # The original wrote to "sentences" (typo), leaving the first-best
            # hypothesis in place for sentence 1.
            sentence = hyps[2]
        cookedtests.append(bleu.cook_test(sentence, cookedrefs[i]))
    bleu2 = bleu.score_cooked(cookedtests)

    # Matches the text the former comma-separated print statement emitted.
    print("Bleus:  %s %s" % (bleu1, bleu2))
def batch_bleu(cans, refs):
    """
    Score every candidate against its own list of references.

    cans : [ 'XXX', 'XXX', ... ]
    refs : [ ['XXX', 'XXX', ... ], ['XXX', 'XXX', ... ], ... ]
           refs[k] holds the references for cans[k]

    Returns an array of shape (len(cans), 3); column j is the BLEU score
    computed with n = j + 1.
    """
    orders = (1, 2, 3)
    scores = np.zeros((len(cans), len(orders)))
    for row, candidate in enumerate(cans):
        references = refs[row]
        per_order = []
        for order in orders:
            cooked_refs = bleu.cook_refs(references, n=order)
            cooked_can = bleu.cook_test(candidate, cooked_refs, n=order)
            per_order.append(bleu.score_cooked([cooked_can], n=order))
        scores[row] = per_order
    return scores
def compute_bleu(net, word_dict, index_dict, tokens, initial=None, IM=None):
    """
    Return BLEU scores for reference tokens

    For each reference caption, a candidate caption is sampled from net
    and scored against that single reference.

    net        -- model handed to sample(); the first net.context tokens
                  of every reference are dropped before scoring
    word_dict  -- passed through to sample() unchanged
    index_dict -- passed through to sample() unchanged
    tokens     -- sequence of reference token sequences
    initial    -- optional initial state for sample(); deep-copied anew
                  for every reference
    IM         -- optional indexable of per-reference inputs; IM[i] is
                  given to sample() for the i-th reference
                  (presumably image features -- confirm against caller)

    Returns an array of shape (len(tokens), 3) whose columns hold the
    BLEU score computed with n = 1, 2 and 3.
    """
    bleu_scores = np.zeros((len(tokens), 3))
    for i, ref in enumerate(tokens):
        # Use "is not None": comparing a numpy array to None with "!=" is
        # elementwise, and the result has no unambiguous truth value.
        if initial is not None:
            init = copy.deepcopy(initial)
        else:
            init = None

        # Strip the leading context window and the last token
        # (presumably an end-of-sentence marker -- confirm).
        ref = ref[net.context:][:-1]

        if IM is not None:
            can = sample(net, word_dict, index_dict, len(ref), IM[i],
                         initial=init)
        else:
            can = sample(net, word_dict, index_dict, len(ref), initial=init)

        # Compute bleu using n = (1,2,3)
        bleu_scores[i] = [
            bleu.score_cooked(
                [bleu.cook_test(can, bleu.cook_refs([ref], n=n), n=n)], n=n)
            for n in (1, 2, 3)
        ]
    return bleu_scores
bleu.preserve_case = True elif opt == "-t": bleu.nist_tokenize = False elif opt == "-p": bleu.clip_len = True elif opt == "-v": verbose = True test1 = [] test2 = [] for lines in itertools.izip(*[file(filename) for filename in args]): cookedrefs = bleu.cook_refs(lines[2:]) test1.append(bleu.cook_test(lines[0], cookedrefs)) test2.append(bleu.cook_test(lines[1], cookedrefs)) score1 = bleu.score_cooked(test1) print "System 1: %f" % score1 print "System 2: %f" % bleu.score_cooked(test2) better = worse = 0 fake = test1[:] for i in xrange(len(fake)): fake[i] = test2[i] fake_score = bleu.score_cooked(fake) if fake_score > score1: better += 1 elif fake_score < score1: worse += 1 if verbose:
bleu.preserve_case = True elif opt == "-t": bleu.nist_tokenize = False elif opt == "-p": bleu.clip_len = True elif opt == "-v": verbose = True test1 = [] test2 = [] for lines in itertools.izip(*[file(filename) for filename in args]): cookedrefs = bleu.cook_refs(lines[2:]) test1.append(bleu.cook_test(lines[0], cookedrefs)) test2.append(bleu.cook_test(lines[1], cookedrefs)) print "System 1: %f" % bleu.score_cooked(test1) print "System 2: %f" % bleu.score_cooked(test2) better1 = better2 = 0 n = 1000 diffs = [] for i in xrange(n): fake1 = [] fake2 = [] for j in xrange(len(test1)): r = random.randrange(len(test1)) fake1.append(test1[r]) fake2.append(test2[r])