import math
from math import isclose

import bleu  # local module providing precook() and cook_refs()

# collapse_reflen(), score_vs_cooked(), and dump() are defined elsewhere in
# this module.


def cook_expected_ref(refs, n=4, p_refs=None, eff_ref_len="average"):
    '''Same inputs as bleu.cook_refs, but n-gram counts are probability-weighted
    averages rather than per-reference maxima.  p_refs is a list of weights
    parallel to refs; uniform 1/len(refs) is used if None.'''

    if p_refs is None:
        l = len(refs)
        p_refs = [1.0 / l] * l
    assert isclose(sum(p_refs), 1)

    reflen = []
    ecount = {}  # expected (weighted) n-gram counts

    for ref, p in zip(refs, p_refs):
        rl, counts = bleu.precook(ref, n)
        reflen.append(rl)
        for ngram, count in counts.items():
            ecount[ngram] = ecount.get(ngram, 0) + p * count

    return collapse_reflen(reflen, eff_ref_len), ecount
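
# A minimal usage sketch, not part of the original module: the references and
# weights below are invented, and it assumes bleu.precook accepts a plain
# sentence string and keys its count dict by word tuples, which is how the
# loop above consumes its output.
def demo_cook_expected_ref():
    refs = ["the cat sat on the mat", "a cat sat on a mat"]
    eff_len, ecount = cook_expected_ref(refs, n=2, p_refs=[0.7, 0.3])
    # ("the", "cat") occurs once in the first reference only, so its expected
    # count is 0.7 -- a weighted average, where bleu.cook_refs would keep the
    # per-reference maximum (1).
    print(eff_len, ecount)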
def mbr_best(lines, nbest, cost_weighting, rank_limit, rank_weight, sys_weights,
             sys_cost_bases=None, fast=False, addprec=1, eff_ref_len="average",
             n=4, cost_base=None, normalize_cost_base=False,
             per_system_norm=True):
    if sys_weights is None or len(sys_weights) == 0:
        no_syswt = True
        sys_weights = {}
    else:
        no_syswt = False
        sumsw = sum(sys_weights.values())
        sys_weights = dict((s, w / sumsw) for s, w in sys_weights.items())

    use_cost_base = (cost_base is not None or sys_cost_bases is not None)
    if use_cost_base and sys_cost_bases is None:
        sys_cost_bases = {}

    if nbest:
        start = 6  # system name was prepended to the usual n-best fields
    else:
        start = 1

    entries_per_system = {}
    # best/worst are taken from the first/last entry in input order, rather
    # than assuming more positive -> better
    best_system_score = {}
    worst_system_score = {}
    max_system_cost = {}
    ref_probs = []
    ref_syss = []
    splits = [line.split(None, start) for line in lines]
    hyps = [bleu.precook(s[start]) for s in splits]
    sump_sys = {}
    for split_ref in splits:
        sysname = split_ref[0]
        entries_per_system[sysname] = entries_per_system.get(sysname, 0) + 1
        if nbest:
            score = float(split_ref[5])
            cost = -score
            if sysname not in best_system_score:
                best_system_score[sysname] = score
            worst_system_score[sysname] = score
            if max_system_cost.get(sysname, 0) < cost:
                max_system_cost[sysname] = cost

    sump = 0
    for split_ref in splits:
        p = 1.0
        sysname = split_ref[0]

        if not per_system_norm and sysname in sys_weights:
            p *= sys_weights[sysname]

        if nbest:
            if rank_weight is not None:
                rank = int(split_ref[4])
                p *= 1. / (rank_weight + rank)
            if use_cost_base and sysname in best_system_score:
                bss = best_system_score[sysname]
                score = float(split_ref[5]) - bss
                if normalize_cost_base:
                    diff = bss - worst_system_score[sysname]
                    if diff != 0:
                        score /= diff
                p *= math.pow(sys_cost_bases.get(sysname, cost_base), score)
            if cost_weighting and max_system_cost.get(sysname, 0) > 0.0:
                cost = -float(split_ref[5])
                p *= cost / max_system_cost[sysname]
        sump += p
        sump_sys[sysname] = sump_sys.get(sysname, 0) + p
        ref_probs.append(p)
        ref_syss.append(sysname)

    if per_system_norm:
        if no_syswt:
            wsys = 1. / len(sump_sys)
            sys_weights = dict.fromkeys(sump_sys.keys(), wsys)
        assert isclose(sum(sys_weights.values()), 1)

        # rescale each entry so every system's entries sum to its weight
        mult_sys = dict((name, sys_weights[name] / tot)
                        for name, tot in sump_sys.items())
        ref_probs = [mult_sys[sn] * p for sn, p in zip(ref_syss, ref_probs)]
        dump(sum(ref_probs))
    elif sump != 1.0:
        oos = 1. / sump
        ref_probs = [p * oos for p in ref_probs]

    if fast:
        expected_ref = cook_expected_ref(hyps, n=n, p_refs=ref_probs,
                                         eff_ref_len=eff_ref_len)
    else:
        cookedrefs = [bleu.cook_refs([s[start]], n=n) for s in splits]

    max_items = []
    avg_bleu = []
    N = len(hyps)
    for test in range(N):
        split_test = splits[test]
        avg_test_bleu = 0.0

        if nbest and rank_limit is not None:
            test_rank = int(split_test[4])
            if test_rank > rank_limit:
                avg_bleu.append(avg_test_bleu)
                continue

        if fast:
            avg_test_bleu = score_vs_cooked(hyps[test], expected_ref, n=n,
                                            addprec=addprec)
        else:
            for ref in range(N):
                if ref != test:
                    score = score_vs_cooked(hyps[test], cookedrefs[ref], n=n,
                                            addprec=addprec)
                else:
                    score = 1.  # each hypothesis votes a perfect score for itself
                avg_test_bleu += ref_probs[ref] * score
        avg_bleu.append(avg_test_bleu)

        if len(max_items) == 0 or avg_test_bleu == avg_bleu[max_items[0]]:
            max_items.append(test)
        elif avg_test_bleu > avg_bleu[max_items[0]]:
            max_items = []
            max_items.append(test)
    dump(avg_bleu)
    return max_items, avg_bleu
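
# A hedged usage sketch with invented inputs; it also assumes score_vs_cooked()
# and dump() from the surrounding module are importable here.  In n-best mode
# each line splits into 7 fields, of which mbr_best reads only field 0 (system
# name), field 4 (rank), field 5 (model score), and field 6 (hypothesis text);
# fields 1-3 are never inspected, so they are placeholders below.
def demo_mbr_best():
    lines = [
        "sysA 0 - - 1 -2.0 the cat sat on the mat",
        "sysA 0 - - 2 -3.5 a cat sat on a mat",
        "sysB 0 - - 1 -1.8 the cat sat on a mat",
    ]
    best, scores = mbr_best(lines, nbest=True, cost_weighting=False,
                            rank_limit=None, rank_weight=None,
                            sys_weights=None)
    # best holds the indices of the hypotheses maximizing expected BLEU
    # against the weighted pool; scores holds every hypothesis's expected BLEU.
    # With sys_weights=None and per_system_norm=True, each system contributes
    # equal total probability mass (here 0.5 for sysA, 0.5 for sysB).
    print(best, scores)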