def cook_expected_ref(refs, n=4, p_refs=None, eff_ref_len="average"):
    '''Same input as bleu.cook_refs, but ngram counts are the probability-weighted
    average over refs rather than the per-ngram max.

    refs -- list of references, each in the form bleu.precook accepts
    n -- maximum ngram order
    p_refs -- parallel (to refs) list of weights; uniform 1/len(refs) if None.
              Must sum to 1 (asserted via isclose).
    eff_ref_len -- forwarded to collapse_reflen to pick the effective ref length

    Returns (collapsed reference length, dict mapping ngram -> expected count).
    '''
    if p_refs is None:  # fix: 'is None', not '== None' (avoids overloaded __eq__)
        p_refs = [1.0 / len(refs)] * len(refs)
    assert isclose(sum(p_refs), 1)
    reflen = []
    ecount = {}
    # builtin zip/.items() behave identically to Py2's izip/iteritems here
    # and also work on Python 3
    for ref, p in zip(refs, p_refs):
        rl, counts = bleu.precook(ref, n)
        reflen.append(rl)
        for ngram, count in counts.items():
            # expected count is the weight-averaged count across references
            ecount[ngram] = ecount.get(ngram, 0) + p * count
    return collapse_reflen(reflen, eff_ref_len), ecount
def cook_expected_ref(refs, n=4, p_refs=None, eff_ref_len="average"):
    '''Same input as bleu.cook_refs, but maxcounts is replaced by the
    weighted average count. p_refs is a parallel (to refs) list of weights;
    a uniform distribution is used if None.

    Returns (collapse_reflen(reference lengths, eff_ref_len),
             dict ngram -> expected count).
    '''
    if p_refs is None:  # fix: identity check for None, not '== None'
        uniform = 1.0 / len(refs)
        p_refs = [uniform] * len(refs)
    assert isclose(sum(p_refs), 1)
    reflen = []
    ecount = {}
    # zip and dict.items work on both Python 2 and 3 (izip/iteritems are 2-only)
    for ref, p in zip(refs, p_refs):
        rl, counts = bleu.precook(ref, n)
        reflen.append(rl)
        for ngram, count in counts.items():
            ecount[ngram] = ecount.get(ngram, 0) + p * count
    return collapse_reflen(reflen, eff_ref_len), ecount
def mbr_best(lines, nbest, cost_weighting, rank_limit, rank_weight, sys_weights,
             sys_cost_bases=None, fast=False, addprec=1, eff_ref_len="average",
             n=4, cost_base=None, normalize_cost_base=False, per_system_norm=True):
    '''Minimum-Bayes-Risk selection: score every hypothesis in `lines` by its
    expected BLEU against all hypotheses (treated as weighted references) and
    return the indices of the best one(s).

    lines -- one hypothesis per line; field 0 is the system name, the text
             starts at field `start` (6 for nbest format, 1 otherwise).
             In nbest format field 4 is presumably the rank and field 5 the
             model score -- TODO confirm against the producer of this format.
    nbest -- True if lines carry rank/score fields
    cost_weighting -- weight each reference by its cost relative to the
                      system's worst cost
    rank_limit -- hypotheses ranked worse than this get bleu 0 (skipped)
    rank_weight -- if not None, weight references by 1/(rank_weight + rank)
    sys_weights -- dict system -> weight (normalized to sum to 1); empty/None
                   means uniform
    sys_cost_bases / cost_base -- exponential weighting base(s) applied to the
                   score gap from the system's best entry
    fast -- score against a single expected (averaged) reference instead of
            all N references (O(N) instead of O(N^2))
    Returns (max_items, avg_bleu): indices tied for the best expected BLEU,
    and the per-hypothesis expected-BLEU list.
    '''
    if sys_weights == None or len(sys_weights) == 0:
        no_syswt = True
        sys_weights = {}
    else:
        no_syswt = False
        # normalize supplied system weights to a distribution
        sumsw = sum(sys_weights.itervalues())
        sys_weights = dict((s, (w / sumsw)) for s, w in sys_weights.iteritems())
    use_cost_base = (cost_base != None or sys_cost_bases != None)
    if use_cost_base and sys_cost_bases == None:
        sys_cost_bases = {}
    if nbest:
        start = 6  # added system also to beginning
    else:
        start = 1
    entries_per_system = {}
    best_system_score = {}  # these are set by first/last in input, rather than assuming more positive -> better
    worst_system_score = {}
    max_system_cost = {}
    # cookedrefs = []
    ref_probs = []   # per-hypothesis reference weight (built below)
    ref_syss = []    # per-hypothesis system name, parallel to ref_probs
    splits = [line.split(None, start) for line in lines]
    hyps = [bleu.precook(s[start]) for s in splits]
    sump_sys = {}
    # First pass: per-system statistics (entry counts, best/worst score, max cost).
    for split_ref in splits:
        sysname = split_ref[0]
        entries_per_system[sysname] = entries_per_system.get(sysname, 0) + 1
        if nbest:
            score = float(split_ref[5])
            cost = -score
            if sysname not in best_system_score:
                best_system_score[sysname] = score  # first entry = best (assumes sorted input)
            worst_system_score[sysname] = score     # last entry wins = worst
            if max_system_cost.get(sysname, 0) < cost:
                max_system_cost[sysname] = cost
    # diff_system_score = dict((s,w-best_system_score[s]) for s,w in worst_system_score.iteritems())
    sump = 0
    # Second pass: compute an (unnormalized) reference weight p for each hypothesis.
    for split_ref in splits:
        p = 1.0
        sysname = split_ref[0]
        if not per_system_norm and sysname in sys_weights:
            # pdb.set_trace()
            p *= sys_weights[sysname]
        if nbest:
            if rank_weight != None:
                rank = int(split_ref[4])
                p *= 1. / (rank_weight + rank)
            if use_cost_base and sysname in best_system_score:
                bss = best_system_score[sysname]
                score = float(split_ref[5]) - bss  # gap from system's best (<= 0)
                if normalize_cost_base:
                    diff = bss - worst_system_score[sysname]
                    # diff=diff_system_score[sysname]
                    if diff != 0:
                        score /= diff
                p *= math.pow(sys_cost_bases.get(sysname, cost_base), score)
            if cost_weighting and max_system_cost.get(sysname, 0) > 0.0:
                cost = -float(split_ref[5])
                p *= cost / max_system_cost[sysname]
        sump += p
        sump_sys[sysname] = sump_sys.get(sysname, 0) + p
        ref_probs.append(p)
        ref_syss.append(sysname)
    if per_system_norm:
        # Normalize within each system so every system contributes sys_weights[sys] total mass.
        if no_syswt:
            wsys = 1. / len(sump_sys)
            sys_weights = dict.fromkeys(sump_sys.iterkeys(), wsys)
        # dump(sys_weights)
        assert isclose(sum(sys_weights.itervalues()), 1)
        # NOTE: genexp variable n does not leak / clobber the parameter n
        mult_sys = dict((n, (sys_weights[n] / s)) for n, s in sump_sys.iteritems());
        # dump([(sum([p for n,p in zip(ref_syss,ref_probs) if n==sn]),sn,sump_sys[sn],mult_sys[sn]) for sn in sump_sys.keys()])
        ref_probs = [mult_sys[sn] * p for sn, p in itertools.izip(ref_syss, ref_probs)]
        dump(sum(ref_probs))
    elif sump != 1.0:
        # global normalization to a probability distribution
        oos = 1. / sump
        ref_probs = [p * oos for p in ref_probs]
    if fast:
        # single expected reference: one scoring per hypothesis
        expected_ref = cook_expected_ref(hyps, n=n, p_refs=ref_probs, eff_ref_len=eff_ref_len)
    else:
        cookedrefs = [bleu.cook_refs([s[start]], n=n) for s in splits]
    max_items = []
    avg_bleu = []
    N = len(hyps)
    for test in xrange(N):
        split_test = splits[test]
        avg_test_bleu = 0.0
        if nbest and rank_limit != None:
            test_rank = int(split_test[4])
            if test_rank > rank_limit:
                # past the rank cutoff: record 0 and skip scoring
                avg_bleu.append(avg_test_bleu)
                continue
        if fast:
            avg_test_bleu = score_vs_cooked(hyps[test], expected_ref, n=n, addprec=addprec)
        else:
            for ref in xrange(N):
                split_ref = splits[ref]
                factor = ref_probs[ref]
                # score=1.
                if ref != test:  # each system gets to vote for itself
                    score = score_vs_cooked(hyps[test], cookedrefs[ref], n=n, addprec=addprec)
                else:
                    score = 1.  # self-comparison counts as perfect BLEU
                avg_test_bleu += ref_probs[ref] * score
        avg_bleu.append(avg_test_bleu)
        # track all indices tied for the best expected BLEU
        if len(max_items) == 0 or avg_test_bleu == avg_bleu[max_items[0]]:
            max_items.append(test)
        elif avg_test_bleu > avg_bleu[max_items[0]]:
            max_items = []
            max_items.append(test)
    dump(avg_bleu)
    # dump([x/avg_bleu[0] for x in avg_bleu])
    return max_items, avg_bleu
def mbr_best(lines, nbest, cost_weighting, rank_limit, rank_weight, sys_weights,
             sys_cost_bases=None, fast=False, addprec=1, eff_ref_len="average",
             n=4, cost_base=None, normalize_cost_base=False, per_system_norm=True):
    '''Minimum-Bayes-Risk hypothesis selection: every input hypothesis is scored
    by its expected BLEU against all hypotheses used as weighted references;
    the indices tied for the highest expected BLEU are returned.

    lines -- one hypothesis per line; field 0 is the system name and the text
             starts at field `start` (6 when nbest, else 1). With nbest,
             field 4 appears to be the rank and field 5 the model score --
             TODO confirm against the producer of this format.
    fast -- score against one expected (probability-averaged) reference,
            O(N), instead of all N cooked references, O(N^2).
    Returns (max_items, avg_bleu).
    '''
    if sys_weights == None or len(sys_weights) == 0:
        no_syswt = True
        sys_weights = {}
    else:
        no_syswt = False
        # normalize the supplied per-system weights to sum to 1
        sumsw = sum(sys_weights.itervalues())
        sys_weights = dict(
            (s, (w / sumsw)) for s, w in sys_weights.iteritems())
    use_cost_base = (cost_base != None or sys_cost_bases != None)
    if use_cost_base and sys_cost_bases == None:
        sys_cost_bases = {}
    if nbest:
        start = 6  # added system also to beginning
    else:
        start = 1
    entries_per_system = {}
    best_system_score = {
    }  # these are set by first/last in input, rather than assuming more positive -> better
    worst_system_score = {}
    max_system_cost = {}
    # cookedrefs = []
    ref_probs = []  # reference weight per hypothesis (filled below)
    ref_syss = []   # system name per hypothesis, parallel to ref_probs
    splits = [line.split(None, start) for line in lines]
    hyps = [bleu.precook(s[start]) for s in splits]
    sump_sys = {}
    # Pass 1: gather per-system statistics.
    for split_ref in splits:
        sysname = split_ref[0]
        entries_per_system[sysname] = entries_per_system.get(sysname, 0) + 1
        if nbest:
            score = float(split_ref[5])
            cost = -score
            if sysname not in best_system_score:
                best_system_score[sysname] = score  # first seen = best (assumes sorted input)
                worst_system_score[sysname] = score
            if max_system_cost.get(sysname, 0) < cost:
                max_system_cost[sysname] = cost
    # diff_system_score = dict((s,w-best_system_score[s]) for s,w in worst_system_score.iteritems())
    sump = 0
    # Pass 2: build each hypothesis's (unnormalized) reference weight p.
    for split_ref in splits:
        p = 1.0
        sysname = split_ref[0]
        if not per_system_norm and sysname in sys_weights:
            # pdb.set_trace()
            p *= sys_weights[sysname]
        if nbest:
            if rank_weight != None:
                rank = int(split_ref[4])
                p *= 1. / (rank_weight + rank)
            if use_cost_base and sysname in best_system_score:
                bss = best_system_score[sysname]
                score = float(split_ref[5]) - bss  # gap from the system's best score
                if normalize_cost_base:
                    diff = bss - worst_system_score[sysname]
                    #diff=diff_system_score[sysname]
                    if diff != 0:
                        score /= diff
                p *= math.pow(sys_cost_bases.get(sysname, cost_base), score)
            if cost_weighting and max_system_cost.get(sysname, 0) > 0.0:
                cost = -float(split_ref[5])
                p *= cost / max_system_cost[sysname]
        sump += p
        sump_sys[sysname] = sump_sys.get(sysname, 0) + p
        ref_probs.append(p)
        ref_syss.append(sysname)
    if per_system_norm:
        # rescale so each system's entries total exactly sys_weights[system]
        if no_syswt:
            wsys = 1. / len(sump_sys)
            sys_weights = dict.fromkeys(sump_sys.iterkeys(), wsys)
        # dump(sys_weights)
        assert isclose(sum(sys_weights.itervalues()), 1)
        # NOTE: the genexp's n is scoped to the genexp; the n parameter is untouched
        mult_sys = dict(
            (n, (sys_weights[n] / s)) for n, s in sump_sys.iteritems())
        # dump([(sum([p for n,p in zip(ref_syss,ref_probs) if n==sn]),sn,sump_sys[sn],mult_sys[sn]) for sn in sump_sys.keys()])
        ref_probs = [
            mult_sys[sn] * p for sn, p in itertools.izip(ref_syss, ref_probs)
        ]
        dump(sum(ref_probs))
    elif sump != 1.0:
        # otherwise normalize globally to a probability distribution
        oos = 1. / sump
        ref_probs = [p * oos for p in ref_probs]
    if fast:
        expected_ref = cook_expected_ref(hyps,
                                         n=n,
                                         p_refs=ref_probs,
                                         eff_ref_len=eff_ref_len)
    else:
        cookedrefs = [bleu.cook_refs([s[start]], n=n) for s in splits]
    max_items = []
    avg_bleu = []
    N = len(hyps)
    for test in xrange(N):
        split_test = splits[test]
        avg_test_bleu = 0.0
        if nbest and rank_limit != None:
            test_rank = int(split_test[4])
            if test_rank > rank_limit:
                # below the rank cutoff: record 0 and skip scoring entirely
                avg_bleu.append(avg_test_bleu)
                continue
        if fast:
            avg_test_bleu = score_vs_cooked(hyps[test],
                                            expected_ref,
                                            n=n,
                                            addprec=addprec)
        else:
            for ref in xrange(N):
                split_ref = splits[ref]
                factor = ref_probs[ref]
                # score=1.
                if ref != test:  # each system gets to vote for itself
                    score = score_vs_cooked(hyps[test],
                                            cookedrefs[ref],
                                            n=n,
                                            addprec=addprec)
                else:
                    score = 1.  # self-match is treated as perfect BLEU
                avg_test_bleu += ref_probs[ref] * score
        avg_bleu.append(avg_test_bleu)
        # keep all indices tied for the current best expected BLEU
        if len(max_items) == 0 or avg_test_bleu == avg_bleu[max_items[0]]:
            max_items.append(test)
        elif avg_test_bleu > avg_bleu[max_items[0]]:
            max_items = []
            max_items.append(test)
    dump(avg_bleu)
    # dump([x/avg_bleu[0] for x in avg_bleu])
    return max_items, avg_bleu