Code Example #1
def compute_bleu(net, word_dict, index_dict, tokens, initial=None, IM=None):
    """
    Return BLEU scores for reference tokens
    For each reference caption, a candidate caption is sampled from net
    """
    bleu_scores = np.zeros((len(tokens), 3))
    for i, ref in enumerate(tokens):
        if initial is not None:
            init = copy.deepcopy(initial)
        else:
            init = None
        ref = ref[net.context:][:-1]
        if IM is not None:
            can = sample(net,
                         word_dict,
                         index_dict,
                         len(ref),
                         IM[i],
                         initial=init)
        else:
            can = sample(net, word_dict, index_dict, len(ref), initial=init)

        # Compute bleu using n = (1,2,3)
        n1 = bleu.score_cooked(
            [bleu.cook_test(can, bleu.cook_refs([ref], n=1), n=1)], n=1)
        n2 = bleu.score_cooked(
            [bleu.cook_test(can, bleu.cook_refs([ref], n=2), n=2)], n=2)
        n3 = bleu.score_cooked(
            [bleu.cook_test(can, bleu.cook_refs([ref], n=3), n=3)], n=3)
        bleu_scores[i] = [n1, n2, n3]

    return bleu_scores
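
All of these examples share a three-step pipeline: bleu.cook_refs digests the reference(s) into n-gram counts, bleu.cook_test collects match statistics for a candidate against those counts, and bleu.score_cooked combines the statistics into a BLEU score. A minimal sketch for one candidate/reference pair, assuming a cmert-0.5-style bleu module is importable (sentence_bleu_123 is a hypothetical helper, not code from any project above; the examples pass both plain strings and token lists, so use whichever form your bleu module accepts):

import bleu

def sentence_bleu_123(candidate, reference):
    """Return (BLEU-1, BLEU-2, BLEU-3) for one candidate/reference pair."""
    scores = []
    for n in (1, 2, 3):
        cooked_refs = bleu.cook_refs([reference], n=n)             # n-gram counts of the reference
        cooked_test = bleu.cook_test(candidate, cooked_refs, n=n)  # match statistics for the candidate
        scores.append(bleu.score_cooked([cooked_test], n=n))       # combine into a BLEU-n score
    return tuple(scores)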
Code Example #2
File: lm_tools.py Project: SunnyWay/im2txtDemo
def batch_bleu(cans, refs):
    """
    cans : [ 'XXX', 'XXX', ... ]
    refs : [ ['XXX', 'XXX', ... ], ['XXX', 'XXX', ... ], ... ]
    """
    bleu_scores = np.zeros((len(cans), 3))
    for i, can in enumerate(cans):
        n1 = bleu.score_cooked([bleu.cook_test(can, bleu.cook_refs(refs[i], n=1), n=1)], n=1)
        n2 = bleu.score_cooked([bleu.cook_test(can, bleu.cook_refs(refs[i], n=2), n=2)], n=2)
        n3 = bleu.score_cooked([bleu.cook_test(can, bleu.cook_refs(refs[i], n=3), n=3)], n=3)
        bleu_scores[i] = [n1,n2,n3]
    return bleu_scores
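
As the docstring indicates, each candidate is scored against its own list of references. A hypothetical call (strings invented for illustration):

cans = ["a dog runs in the park"]
refs = [["the dog runs in a park", "a dog is running in the park"]]
scores = batch_bleu(cans, refs)  # shape (1, 3): BLEU-1/2/3 for the one candidate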
Code Example #3
File: test_scorer.py Project: awildfox/moses
def main():
    sys.path.append("../scripts/training/cmert-0.5")
    import bleu
    data_dir = "test_scorer_data"
    nbest_file = os.path.join(data_dir, "nbest.out")
    ref_file = os.path.join(data_dir, "reference.txt")
    bleu.preserve_case = False
    bleu.eff_ref_len = "shortest"
    bleu.nonorm = 0

    ref_fh = open(ref_file)
    cookedrefs = []
    for ref in ref_fh:
        cookedref = bleu.cook_refs([ref])
        cookedrefs.append(cookedref)
    ref_fh.close()

    nbest_fh = open(nbest_file)
    tests = []
    i = -1
    for line in nbest_fh:
        fields = line.split("||| ")
        current_i = int(fields[0])
        text = fields[1]
        if i != current_i:
            tests.append([])
            i = current_i
        tests[-1].append(text)
    nbest_fh.close()

    #  score with first best
    cookedtests = []
    for i in range(len(tests)):
        sentence = tests[i][0]
        cookedtest = bleu.cook_test(sentence, cookedrefs[i])
        stats = " ".join([
            "%d %d" % (c, g)
            for (c, g) in zip(cookedtest['correct'], cookedtest['guess'])
        ])
        print " %s %d" % (stats, cookedtest['reflen'])
        cookedtests.append(cookedtest)
    bleu1 = bleu.score_cooked(cookedtests)

    # vary, and score again
    cookedtests = []
    for i in range(len(tests)):
        sentence = tests[i][0]
        if i == 7:
            sentence = tests[i][8]
        elif i == 1:
            sentence = tests[i][2]
        cookedtest = bleu.cook_test(sentence, cookedrefs[i])
        cookedtests.append(cookedtest)
    bleu2 = bleu.score_cooked(cookedtests)

    print "Bleus: ", bleu1, bleu2
Code Example #4
File: score-nbest.py Project: awildfox/moses
            bleu.preserve_case = True
        if opt == "-a":
            bleu.eff_ref_len = "average"
        if opt == "-s":
            bleu.eff_ref_len = "shortest"
        if opt == "-e":
            bleu.eff_ref_len = "closest"
        if opt == "-n":
            bleu.nonorm = 1

    print args    
    cookedrefs = []
    reffiles = [file(name) for name in args[:-1]]
    print reffiles
    for refs in itertools.izip(*reffiles):
        cookedrefs.append(bleu.cook_refs(refs))
    
    outprefix = args[-1]

    featsfile = file(outprefix+"feats.opt", "w")
    candsfile = file(outprefix+"cands.opt", "w")

    cur_sentnum = None
    testsents = set()
    progress = 0

    infile = sys.stdin

    # function that recognizes floats
    re_float = re.compile(r'^-?[-0-9.e\+]+$')
    is_float = lambda x: re_float.match(x)
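
Here itertools.izip(*reffiles) yields the i-th line of every reference file as a single tuple, so each bleu.cook_refs call sees all references for one source sentence. A small illustration with invented contents:

refs_a = ["the cat sat .", "he went home ."]
refs_b = ["a cat sat down .", "he returned home ."]
for refs in zip(refs_a, refs_b):  # stands in for itertools.izip(*reffiles)
    print(refs)
# ('the cat sat .', 'a cat sat down .')
# ('he went home .', 'he returned home .')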
Code Example #5
def mbr_best(lines,
             nbest,
             cost_weighting,
             rank_limit,
             rank_weight,
             sys_weights,
             sys_cost_bases=None,
             fast=False,
             addprec=1,
             eff_ref_len="average",
             n=4,
             cost_base=None,
             normalize_cost_base=False,
             per_system_norm=True):
    if sys_weights is None or len(sys_weights) == 0:
        no_syswt = True
        sys_weights = {}
    else:
        no_syswt = False
        sumsw = sum(sys_weights.itervalues())
        sys_weights = dict(
            (s, (w / sumsw)) for s, w in sys_weights.iteritems())

    use_cost_base = (cost_base is not None or sys_cost_bases is not None)
    if use_cost_base and sys_cost_bases is None:
        sys_cost_bases = {}

    if nbest:
        start = 6  # added system also to beginning
    else:
        start = 1

    entries_per_system = {}
    best_system_score = {}  # these are set by first/last in input, rather than assuming more positive -> better
    worst_system_score = {}
    max_system_cost = {}
    #    cookedrefs = []
    ref_probs = []
    ref_syss = []
    splits = [line.split(None, start) for line in lines]
    hyps = [bleu.precook(s[start]) for s in splits]
    sump_sys = {}
    for split_ref in splits:
        sysname = split_ref[0]
        entries_per_system[sysname] = entries_per_system.get(sysname, 0) + 1
        if nbest:
            score = float(split_ref[5])
            cost = -score
            if sysname not in best_system_score:
                best_system_score[sysname] = score
            worst_system_score[sysname] = score
            if max_system_cost.get(sysname, 0) < cost:
                max_system_cost[sysname] = cost

#    diff_system_score = dict((s,w-best_system_score[s]) for s,w in worst_system_score.iteritems())

    sump = 0
    for split_ref in splits:
        p = 1.0

        sysname = split_ref[0]

        if not per_system_norm and sysname in sys_weights:
            #            pdb.set_trace()
            p *= sys_weights[sysname]

        if nbest:
            if rank_weight is not None:
                rank = int(split_ref[4])
                p *= 1. / (rank_weight + rank)
            if use_cost_base and sysname in best_system_score:
                bss = best_system_score[sysname]
                score = float(split_ref[5]) - bss
                if normalize_cost_base:
                    diff = bss - worst_system_score[sysname]
                    #diff=diff_system_score[sysname]
                    if diff != 0:
                        score /= diff
                p *= math.pow(sys_cost_bases.get(sysname, cost_base), score)
            if cost_weighting and max_system_cost.get(sysname, 0) > 0.0:
                cost = -float(split_ref[5])
                p *= cost / max_system_cost[sysname]
        sump += p
        sump_sys[sysname] = sump_sys.get(sysname, 0) + p
        ref_probs.append(p)
        ref_syss.append(sysname)

    if per_system_norm:
        if no_syswt:
            wsys = 1. / len(sump_sys)
            sys_weights = dict.fromkeys(sump_sys.iterkeys(), wsys)
#            dump(sys_weights)
        assert isclose(sum(sys_weights.itervalues()), 1)

        mult_sys = dict(
            (n, (sys_weights[n] / s)) for n, s in sump_sys.iteritems())
        #        dump([(sum([p for n,p in zip(ref_syss,ref_probs) if n==sn]),sn,sump_sys[sn],mult_sys[sn]) for sn in sump_sys.keys()])
        ref_probs = [
            mult_sys[sn] * p for sn, p in itertools.izip(ref_syss, ref_probs)
        ]
        dump(sum(ref_probs))
    elif sump != 1.0:
        oos = 1. / sump
        ref_probs = [p * oos for p in ref_probs]

    if fast:
        expected_ref = cook_expected_ref(hyps,
                                         n=n,
                                         p_refs=ref_probs,
                                         eff_ref_len=eff_ref_len)
    else:
        cookedrefs = [bleu.cook_refs([s[start]], n=n) for s in splits]

    max_items = []
    avg_bleu = []
    N = len(hyps)
    for test in xrange(N):
        split_test = splits[test]
        avg_test_bleu = 0.0

        if nbest and rank_limit is not None:
            test_rank = int(split_test[4])
            if test_rank > rank_limit:
                avg_bleu.append(avg_test_bleu)
                continue

        if fast:
            avg_test_bleu = score_vs_cooked(hyps[test],
                                            expected_ref,
                                            n=n,
                                            addprec=addprec)
        else:
            for ref in xrange(N):
                split_ref = splits[ref]
                factor = ref_probs[ref]
                #                score=1.
                if ref != test:  # each system gets to vote for itself
                    score = score_vs_cooked(hyps[test],
                                            cookedrefs[ref],
                                            n=n,
                                            addprec=addprec)
                else:
                    score = 1.
                avg_test_bleu += ref_probs[ref] * score
        avg_bleu.append(avg_test_bleu)

        if len(max_items) == 0 or avg_test_bleu == avg_bleu[max_items[0]]:
            max_items.append(test)
        elif avg_test_bleu > avg_bleu[max_items[0]]:
            max_items = []
            max_items.append(test)
    dump(avg_bleu)
    #    dump([x/avg_bleu[0] for x in avg_bleu])
    return max_items, avg_bleu
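
mbr_best performs Minimum Bayes Risk selection: each hypothesis is scored by its probability-weighted average BLEU against every hypothesis (the others act as pseudo-references, and a hypothesis gets full credit, 1.0, for itself), and the indices attaining the maximum average are returned. The selection rule in isolation, with a generic similarity function standing in for BLEU (all names illustrative; mbr_best additionally keeps ties):

def mbr_select(hyps, probs, similarity):
    """Return the index of the hypothesis with the highest expected similarity."""
    best_i, best_score = 0, float("-inf")
    for i, hyp in enumerate(hyps):
        # probability-weighted similarity to every hypothesis, self included
        expected = sum(p * (1.0 if i == j else similarity(hyp, ref))
                       for j, (ref, p) in enumerate(zip(hyps, probs)))
        if expected > best_score:
            best_i, best_score = i, expected
    return best_i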
Code Example #6
File: bleu_plus1.py Project: rupenp/transforest
        logbleu += min(0, 1 - float(comps['reflen'] + 1) / (comps['testlen'] + 1))

    return math.exp(logbleu)

if __name__ == "__main__":
    optparser = optparse.OptionParser()
    optparser.add_option("-m", "--map-file", dest="mapfilename", help="map file indicating sentence number in reference set for each line of input")
    optparser.add_option("-b", "--brevity-penalty", dest="brevitypenalty", action="store_true", help="assess brevity penalty")
    (opts, args) = optparser.parse_args()

    
    n = 4

    cookedrefs = []
    for lines in itertools.izip(*[file(filename) for filename in args[1:]]):
        cookedrefs.append(bleu.cook_refs(lines, n=n))

    if opts.mapfilename is not None:
        linemap = []
        for line in file(opts.mapfilename):
            linemap.append(int(line))
    else:
        linemap = range(len(cookedrefs))

    if args[0] == "-":
        infile = sys.stdin
    else:
        infile = open(args[0])
    test1 = []
    for (line,i) in itertools.izip(infile, linemap):
        test1.append(bleu.cook_test(line, cookedrefs[i], n=n))
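
The excerpt shows only the tail of the scoring function: the brevity penalty is applied with add-one smoothing on both lengths, and the log-space score is exponentiated. For context, a sentence-level BLEU with add-one smoothing can be sketched as below. This is a hedged reconstruction, not the project's exact code; BLEU+1 variants differ on whether the unigram precision is also smoothed:

import math

def bleu_plus_one(correct, guess, reflen, testlen, n=4):
    """Sentence-level BLEU with add-one smoothing.

    correct[k] / guess[k] are the matched and total (k+1)-gram counts,
    mirroring the 'correct' and 'guess' fields of a cooked test.
    """
    logbleu = 0.0
    for k in range(n):
        # add-one smoothing keeps the log finite when a precision would be zero
        logbleu += math.log(correct[k] + 1.0) - math.log(guess[k] + 1.0)
    logbleu /= n
    # smoothed brevity penalty, as in the excerpt above
    logbleu += min(0, 1 - float(reflen + 1) / (testlen + 1))
    return math.exp(logbleu)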
Code Example #7
bleu.normalize = normalize

# usage: bleu+1.py <test> <ref>+

if __name__ == "__main__":
    optparser = optparse.OptionParser()
    optparser.add_option("-m", "--map-file", dest="mapfilename", help="map file indicating sentence number in reference set for each line of input")
    optparser.add_option("-b", "--brevity-penalty", dest="brevitypenalty", action="store_true", help="assess brevity penalty")
    (opts, args) = optparser.parse_args()

    n = 4

    cookedrefs = []
    for lines in itertools.izip(*[file(filename) for filename in args[1:]]):
        cookedrefs.append(bleu.cook_refs([line.split() for line in lines], n=n))

    if opts.mapfilename is not None:
        linemap = []
        for line in file(opts.mapfilename):
            linemap.append(int(line))
    else:
        linemap = range(len(cookedrefs))

    test1 = []
    for (line,i) in itertools.izip(file(args[0]), linemap):
        test1.append(bleu.cook_test(line.split(), cookedrefs[i], n=n))

    total = 0.
    n_sent = 0
Code Example #8
File: bleu-sign-test.py Project: rupenp/transforest
    (opts, args) = getopt.getopt(sys.argv[1:], "rctpv", [])
    for (opt, parm) in opts:
        if opt == "-c":
            bleu.preserve_case = True
        elif opt == "-t":
            bleu.nist_tokenize = False
        elif opt == "-p":
            bleu.clip_len = True
        elif opt == "-v":
            verbose = True

    test1 = []
    test2 = []
    for lines in itertools.izip(*[file(filename) for filename in args]):
        cookedrefs = bleu.cook_refs(lines[2:])
        test1.append(bleu.cook_test(lines[0], cookedrefs))
        test2.append(bleu.cook_test(lines[1], cookedrefs))

    score1 = bleu.score_cooked(test1)
    print "System 1: %f" % score1
    print "System 2: %f" % bleu.score_cooked(test2)

    better = worse = 0
    fake = test1[:]
    for i in xrange(len(fake)):
        fake[i] = test2[i]

        fake_score = bleu.score_cooked(fake)
        if fake_score > score1:
            better += 1
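
This builds toward a paired sign test: one segment at a time, system 1's cooked statistics are swapped for system 2's, the whole corpus is rescored, and swaps that raise or lower the corpus score are tallied. The excerpt is truncated; a sketch of the complete tally, assuming each swap is undone before the next (names illustrative):

def sign_test_counts(test1, test2, baseline):
    """Count single-segment swaps that raise or lower corpus BLEU."""
    better = worse = 0
    fake = test1[:]
    for i in range(len(fake)):
        fake[i] = test2[i]  # swap in system 2's statistics for segment i
        fake_score = bleu.score_cooked(fake)
        if fake_score > baseline:
            better += 1
        elif fake_score < baseline:
            worse += 1
        fake[i] = test1[i]  # restore before the next swap
    return better, worse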
Code Example #9
File: runrerank.py Project: jungikim/sbmt
    modelweights = dict()
    for wname, wval in weights.iteritems():
        if wname not in args.feats:
            modelweights[wname] = float(wval)

    # write extra random weights to temp file
    for point in range(args.randoms):
        startweights.write(
            ' '.join(map(str, np.random.rand(len(args.feats) + 1))) + "\n")
    startweights.close()

    # cook references for comps
    cookedrefs = []
    for lines in itertools.izip(*(args.reference)):
        cookedrefs.append(
            bleu.cook_refs([line.split() for line in lines], n=bleun))

    for line in infile:
        prefeats = parse_nbest(line.strip())
        feats = dd(lambda: "0")
        feats.update(prefeats)
        hyp = feats[hypkey].lstrip("{").rstrip("}")
        sent = int(feats[sentkey]) - 1

        # write hyp to temp file
        hypfile.write(hyp + "\n")

        # write id, components, features to tuning file

        tunefile.write("%d ||| " % sent)