def bleu4(length_gold, length_pred, counts_hit, counts_all, smoothing=False, report=False, ss_short=False):
    s, cc = 0., 0
    them = {}
    bp = BleuCalculator.brevity_penalty(length_gold, length_pred)
    them["bp"] = bp
    utils.zcheck_matched_length(counts_hit, counts_all)
    for h, a in zip(counts_hit, counts_all):
        if cc > 0 and smoothing:
            # +1 smoothing for n>1 maybe
            # todo(warn) may result in 0/*/*/*
            vv = (h + 1) / (a + 1)
        else:
            vv = h / a
        them[cc] = vv
        if vv <= 0:
            utils.zlog("Zero 1-gram counts !!", func="warn")
            s += utils.Constants.MIN_V
        else:
            s += math.log(vv)
        cc += 1
    s /= cc
    bleu = bp * math.exp(s)
    them["bleu"] = bleu
    ss = None
    if report:
        # utils.zlog("BLEU4-Counts: %s-%s" % (counts_hit, counts_all))
        ss = "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, hyp_len=%d, ref_len=%d)" \
             % (them["bleu"]*100, them[0]*100, them[1]*100, them[2]*100, them[3]*100, bp, length_pred, length_gold)
        utils.zlog(ss)
    if ss_short:
        ss = "%.2f(BP=%.3f,L=%d)" % (them["bleu"]*100, bp, length_pred)
    return bleu, ss
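# --- Hedged usage sketch (not part of the original BleuCalculator): a minimal, self-contained
# --- re-implementation showing how the (hit, all) count lists consumed by bleu4() can be built
# --- from clipped n-gram counts; the helper names `clipped_counts`/`demo_corpus_bleu` are illustrative only.
import math
from collections import Counter

def clipped_counts(ref_tokens, hyp_tokens, n=4):
    # returns (hits, alls): per-order clipped matches and total hypothesis n-grams
    hits, alls = [], []
    for k in range(1, n + 1):
        ref_ngrams = Counter(tuple(ref_tokens[i:i+k]) for i in range(len(ref_tokens)-k+1))
        hyp_ngrams = Counter(tuple(hyp_tokens[i:i+k]) for i in range(len(hyp_tokens)-k+1))
        hits.append(sum(min(c, ref_ngrams[g]) for g, c in hyp_ngrams.items()))
        alls.append(sum(hyp_ngrams.values()))
    return hits, alls

def demo_corpus_bleu(ref, hyp, n=4):
    # same structure as bleu4() above, with an explicit brevity penalty instead of
    # BleuCalculator.brevity_penalty (assumed here to be exp(1 - ref/hyp) when hyp < ref)
    hits, alls = clipped_counts(ref, hyp, n)
    bp = 1.0 if len(hyp) >= len(ref) else math.exp(1.0 - len(ref) / len(hyp))
    precisions = [(h + 1) / (a + 1) if i > 0 else h / a for i, (h, a) in enumerate(zip(hits, alls))]
    return bp * math.exp(sum(math.log(max(p, 1e-9)) for p in precisions) / n)

# example: demo_corpus_bleu("the cat sat on the mat".split(), "the cat sat on a mat".split())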
def fit_once(fit_files):
    # first fit a simple model: y = gaussian(a*x+b, sigma); xenc is not included here since that would be too large
    with utils.Timer(tag="Fit-length-once", print_date=True):
        # 1. collect lengths
        with utils.zopen(fit_files[0]) as f0, utils.zopen(fit_files[1]) as f1:
            # todo(warn): plus one for the <eos> tokens
            x = [LinearGaussain.trans_len(len(_l.split())+1) for _l in f0]
            y = [LinearGaussain.trans_len(len(_l.split())+1) for _l in f1]
        utils.zcheck_matched_length(x, y, _forced=True)
        ll = len(x)
        x1, y1 = np.array(x, dtype=np.float32).reshape((-1, 1)), np.array(y, dtype=np.float32)
        # 2. fit linear model
        try:
            # todo(warn)
            regr = linear_model.LinearRegression()
            regr.fit(x1, y1)
            a, b = float(regr.coef_[0]), float(regr.intercept_)
        except Exception:
            utils.zlog("Cannot linear-regression, skip that.")
            a, b = 1., 0.
        # 3. fit sigma
        x1 = x1.reshape((-1,))
        errors = a*x1 + b - y1
        mu = np.mean(errors)
        sigma = np.sqrt(((errors - mu)**2).mean())
        ret = (a, b, sigma, mu)
        del x, y, x1, y1
        utils.zlog("Fitting Length with %s sentences and get %s." % (ll, ret), func="score")
    return ret
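# --- Hedged sketch (illustrative, not the repo's LinearGaussain class): fitting the same
# --- "y ~ Gaussian(a*x + b, sigma)" length model directly on length pairs with scikit-learn;
# --- `fit_length_gaussian` is a made-up helper name for demonstration.
import numpy as np
from sklearn import linear_model

def fit_length_gaussian(src_lens, trg_lens):
    x1 = np.asarray(src_lens, dtype=np.float32).reshape((-1, 1))
    y1 = np.asarray(trg_lens, dtype=np.float32)
    regr = linear_model.LinearRegression()
    regr.fit(x1, y1)                                      # least-squares fit of y = a*x + b
    a, b = float(regr.coef_[0]), float(regr.intercept_)
    errors = a * x1.reshape((-1,)) + b - y1               # residuals of the linear fit
    mu = float(np.mean(errors))
    sigma = float(np.sqrt(((errors - mu) ** 2).mean()))   # std of residuals -> Gaussian width
    return a, b, sigma, mu

# example: fit_length_gaussian([10, 20, 30, 40], [12, 19, 33, 41])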
def _argmax_list(ll, amount=10, cfs=(lambda x: x[0]>1e-5, lambda x: x[0]<=1e-5 and x[0]>=-1e-5, lambda x: x[0]<-1e-5), buckets=(0, 15, 30, 50, 1000), golds_len0=None):
    def _sort_and_report(_l2):
        rankings = sorted(_l2, key=lambda x: x[-1], reverse=True)
        for i in range(min(len(rankings), amount)):     # bound by the actual list, not the full length
            idx, content = rankings[i][0], rankings[i][1]
            utils.zlog("#%d Max-ranking, index is %s, content is %s." % (i, idx, content), func="details")
    # --
    r = len(ll)
    rs = []
    rs_descr = []
    for cf in cfs:
        cc = sum(1 if cf(one) else 0 for one in ll)
        rs.append(cc)
        rs_descr.append("%d/%d/%.3f" % (cc, r, cc/r))
    utils.zlog("Countings: %s" % (" ".join(rs_descr),))
    _sort_and_report([(i, one) for i, one in enumerate(ll)])
    # analyzing buckets of length of gold0
    for i in range(len(buckets)-1):
        a, b = buckets[i], buckets[i+1]
        _rf = lambda x: a <= x < b
        ll2 = []
        for j, one in enumerate(ll):    # use a separate index to avoid shadowing the bucket index
            if _rf(golds_len0[j]):
                ll2.append((j, one))
        utils.zlog("Range [%d, %d): %d/%d/%.3f" % (a, b, len(ll2), r, len(ll2)/r), func="details")
        _sort_and_report(ll2)
def _get_lang(gold_fn):
    # current candidates:
    cands = ["en", "fr", "de", "zh"]
    for c in cands:
        if c in gold_fn:
            return c
    zlog("Unknown target language for evaluating!!", func="warn")
    return "en"
def _report_log(self):
    utils.zlog("ResultLogger: %d/%.3f/%.3f/%.3f"
               % (self.num_insts, self.num_ends / self.num_insts, self.num_mends / self.num_insts, self.num_mends / self.num_ends))
    with utils.zopen("length_sizes.txt", "w") as fd:
        _tmp_idx = 0
        for zl, zs in zip(self.ls[0], self.ls[1]):
            _tmp_idx += 1
            fd.write("%d %d " % (zl, zs))
            if _tmp_idx % 100 == 0:
                fd.write("\n")
def __init__(self, model, xlen, xadd, xback, length_info):
    super(LinearGaussain, self).__init__(model)
    self.xlen = xlen        # dim of src repr
    self.xadd = xadd        # whether add xsrc
    self.xback = xback      # whether prop back through xsrc
    if length_info is None:
        length_info = LinearGaussain._DEFAULT_INFO
    # params
    utils.zlog("Init lg with len-info %s" % (length_info,))
    self.params["W"] = self._add_params((1, xlen))
    self.params["A"] = self._add_params((1, 1), init=np.array([length_info[0]], dtype=np.float32))
    self.params["B"] = self._add_params((1,), init=np.array([length_info[1]], dtype=np.float32))
    self.params["SI"] = self._add_params((1,), init=np.array([length_info[2]], dtype=np.float32))
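# --- Hedged sketch (assumption, not taken from the repo): how the A/B/SI parameters above can be
# --- used to score a target length y given a (transformed) source length x under the Gaussian
# --- length model; `length_log_prob` is an illustrative helper, not an original method.
import math

def length_log_prob(y, x, a, b, sigma):
    # log N(y; a*x + b, sigma^2)
    mean = a * x + b
    return -0.5 * math.log(2.0 * math.pi * sigma * sigma) - ((y - mean) ** 2) / (2.0 * sigma * sigma)

# example: length_log_prob(y=21, x=20, a=1.0, b=0.5, sigma=3.0)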
def _validate_len(self, dev_iter):
    # squared length error (returned negated, so that higher is better)
    count = 0
    loss = 0.
    with utils.Timer(tag="VALID-LEN", print_date=True) as et:
        utils.zlog("With lg as %s." % (self._mm.lg.obtain_params(),))
        for insts in dev_iter.arrange_batches():
            ys = [i[1] for i in insts]
            ylens = np.asarray([len(_y) for _y in ys])
            count += len(ys)
            Model.new_graph()
            self._mm.refresh(False)
            preds = self._mm.predict_length(insts)
            loss += np.sum((preds - ylens)**2)
    return -loss / count
def main():
    # init
    opts = mt_args.init("test")
    looping = opts["loop"]
    # 1. data
    source_dict, target_dict = Vocab.read(opts["dicts"][0]), Vocab.read(opts["dicts"][1])
    # -- here usually no need for test[1], but for convenience ...
    if not looping:
        dicts = [source_dict] + [target_dict for _ in opts["test"][1:]]
        test_iter = get_arranger(opts["test"], dicts, multis=False, shuffling_corpus=False, shuflling_buckets=False,
                                 sort_prior=[0], batch_size=opts["test_batch_size"], maxibatch_size=-1,
                                 max_len=utils.Constants.MAX_V, min_len=0, one_len=opts["max_len"]+1, shuffling0=False)
    # 2. model
    mm = []
    for mn in opts["models"]:
        x = s2sModel(opts, source_dict, target_dict, None)      # rebuild from opts, thus use the same opts when testing
        x.load(mn)
        mm.append(x)
    if len(mm) == 0:
        utils.zlog("No models specified, must be testing mode?", func="warn")
        mm.append(s2sModel(opts, source_dict, target_dict, None))   # no loading, only for testing
    # 3. decode
    if not looping:
        utils.zlog("=== Start to decode ===", func="info")
        with utils.Timer(tag="Decoding", print_date=True):
            mt_decode(opts["decode_way"], test_iter, mm, target_dict, opts, opts["output"])
        utils.zlog("=== End decoding, write to %s ===" % opts["output"], func="info")
        # todo(warn) forward-compatible evaluation
        if len(opts["test"]) > 1:
            gold = opts["test"][1]
        else:
            gold = opts["gold"][0]
        mt_eval.evaluate(opts["output"], gold, opts["eval_metric"])
    else:
        ot = Outputter(opts)
        while True:
            utils.zlog("Enter the src to translate:")
            line = sys.stdin.readline()
            if len(line) == 0:
                break
            # prepare one
            one_words = line.strip().split()
            one_idxes = Vocab.w2i(source_dict, one_words, add_eos=True, use_factor=False)
            one_inst = TextInstance([one_words], [one_idxes])
            rs = mt_decode(opts["decode_way"], [one_inst], mm, target_dict, opts, opts["output"])
            utils.zlog(ot.format(rs[0], target_dict, False, False))
def analyse(srcs, golds, preds, kbests, n=4, on_words=True):
    # mainly two goals: compare pred/oracle/gold & compare between preds
    BleuCalculator.add_clipped_counts(golds, preds, n, on_words=on_words)
    for curk in kbests:
        utils.zlog("Start for kbest: k==%s" % curk, func="time")
        for i, pred in enumerate(preds):
            utils.zlog("For file num %i" % i, func="time")
            BleuCalculator.analyse_single(golds, pred, curk, n)
            utils.zlog("", func="time")
        if len(preds) > 1:
            BleuCalculator.analyse_multi(golds, preds, 1, n)
            utils.zlog("", func="time")
def _eval_bleu(output, gold, process_gold, lowercase=False):
    dir_name = os.path.dirname(os.path.abspath(__file__))
    restore_name = os.path.join(dir_name, "..", "scripts", "restore.sh")
    script_name = os.path.join(dir_name, "..", "scripts", "moses", "multi-bleu.perl")
    # zmt_name = os.path.join(dir_name, "..")
    # todo(warn) need to find mosesdecoder for restore: default $ZMT is znmt/../
    # maybe preprocess
    # todo: special treatment for files with multiple references
    if str.isnumeric(gold[-1]):
        zlog("Evaluating instead on %s to deal with multiple references of original %s." % (gold[:-1], gold), func="warn")
        gold = gold[:-1]
    elif process_gold:
        gold_res = "temp.somekindofhelpless.gold.restore"
        os.system("bash %s < %s > %s" % (restore_name, gold, gold_res))
        gold = gold_res
    maybe_lc = "-lc" if lowercase else ""
    cmd = "bash %s < %s | perl %s %s %s" % (restore_name, output, script_name, maybe_lc, gold)
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
    line = p.stdout.readlines()
    zlog("Evaluating %s to %s." % (output, gold), func="info")
    zlog(str(line), func="score")
    b = float(line[-1].split()[2][:-1])
    return b
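# --- Hedged sketch: the index-based parse above assumes multi-bleu.perl's usual output line,
# --- e.g. b"BLEU = 28.53, 60.1/35.8/23.4/15.9 (BP=1.000, ...)"; a regex-based parse of the same
# --- line is shown here as an illustrative alternative (`parse_multi_bleu_line` is a made-up name).
import re

def parse_multi_bleu_line(line):
    if isinstance(line, bytes):
        line = line.decode("utf-8", errors="ignore")
    m = re.search(r"BLEU = ([0-9.]+),", line)
    return float(m.group(1)) if m is not None else None

# example: parse_multi_bleu_line(b"BLEU = 28.53, 60.1/35.8/23.4/15.9 (BP=1.000, ratio=0.99)")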
def main():
    # init
    opts = mt_args.init("train")
    # start to train
    # 1. obtain dictionaries
    source_corpus, target_corpus = opts["train"]
    source_dict, target_dict = None, None
    if not opts["rebuild_dicts"]:
        try:
            source_dict, target_dict = Vocab.read(opts["dicts"][0]), Vocab.read(opts["dicts"][1])
        except Exception:
            utils.zlog("Read dictionaries fail %s, rebuild them." % (opts["dicts"],), func="warn")
    if source_dict is None or target_dict is None:
        # rebuild the dictionaries from corpus
        source_dict = Vocab(fname=source_corpus, rthres=opts["dicts_rthres"], fthres=opts["dicts_fthres"])
        target_dict = Vocab(fname=target_corpus, rthres=opts["dicts_rthres"], fthres=opts["dicts_fthres"])
        # save dictionaries
        try:
            source_dict.write(opts["dicts"][0])
            target_dict.write(opts["dicts"][1])
        except Exception:
            utils.zlog("Write dictionaries fail: %s, skip this step." % opts["dicts_final"], func="warn")
    # 2. corpus iterator
    shuffling0 = opts["shuffle_training_data_onceatstart"]
    sort_prior = {"src": [0], "trg": [1], "src-trg": [0, 1], "trg-src": [1, 0]}[opts["training_sort_type"]]
    train_iter = get_arranger(opts["train"], [source_dict, target_dict], multis=False,
                              shuffling_corpus=opts["shuffle_training_data"], shuflling_buckets=opts["shuffle_training_data"],
                              sort_prior=sort_prior, batch_size=opts["batch_size"], maxibatch_size=20,
                              max_len=opts["max_len"]+1, min_len=2, one_len=opts["max_len"]+1, shuffling0=shuffling0)
    dev_iter = get_arranger(opts["dev"], [source_dict, target_dict], multis=False,
                            shuffling_corpus=False, shuflling_buckets=False, sort_prior=[0],
                            batch_size=opts["valid_batch_size"], maxibatch_size=-1,
                            max_len=utils.Constants.MAX_V, min_len=0, one_len=opts["max_len"]+1, shuffling0=False)
    # 3. about model & trainer
    # <special one> fit a gaussian first
    length_info = LinearGaussain.fit_once(opts["train"])    # todo: train or dev?
    mm = s2sModel(opts, source_dict, target_dict, length_info)
    tt = MTTrainer(opts, mm)    # trainer + training_progress
    if opts["reload"] and os.path.exists(opts["reload_model_name"]):
        tt.load(opts["reload_model_name"], opts["reload_training_progress"])
    # 4. training
    tt.train(train_iter, dev_iter)
    utils.zlog("=== Training ok!! ===", func="info")
def report(self):
    utils.zlog("Outputter final: count=%s/%s, replaced=%s" % (self.inst_count, self.sent_count, self.replaced))
def global_prune_ngram_greedy(cand_states, rest_beam_size, sig_beam_size, thresh, penalty, ngram_n, ngram_range, pr_global_lreward=0., pr_global_nalpha=1.):
    # on sorted list, comparing according to normalized scores -- greedy pruning
    # todo: how could we compare diff length states? -> normalize partial score
    # todo: how to do pruning and sig-max (there might be crossings)? -> take the greedy way
    # _get_score_f = (lambda x: x.score_partial/x.length)
    _get_score_f = lambda x: (x.score_partial + pr_global_lreward * x.length) / (x.length**pr_global_nalpha)
    sig_ngram_maxs = {}                     # all-step max (for survived ones)
    sig_ngram_curnum = defaultdict(int)     # last-step state lists for sig
    sig_ngram_allnum = defaultdict(int)     # all-step state counts for sig
    temp_ret = []
    ngram_range = max(0, ngram_range)       # to be sure >= 0
    # pruning
    for one in cand_states:
        if len(temp_ret) >= rest_beam_size:
            one.state("PR_BEAM")
            continue
        # ngram sigs, according to the listing
        them = one.get_path(maxlen=ngram_range)
        if len(them) > 0:
            this_pruned = False
            # pruning according to sig-size and thresh
            cur_sig = one.sig_ngram(ngram_n)
            flag_not_best = False
            if cur_sig in sig_ngram_maxs:
                this_score, high_score = _get_score_f(one), _get_score_f(sig_ngram_maxs[cur_sig])
                if this_score <= high_score:
                    # not the best until current
                    flag_not_best = True
                    if sig_ngram_allnum[cur_sig] >= sig_beam_size:
                        one.state("PR_NGRAM_EXPAND")
                        this_pruned = True
                    elif this_score <= high_score - thresh:
                        one.state("PR_NGRAM_DIFF")
                        this_pruned = True
                    # check cov for pruning
                    if this_pruned:
                        pruner_one = sig_ngram_maxs[cur_sig]
                        if not Pruner.cov_checker.cov_ok(one, pruner_one):
                            one.state("ZZ")
                            one.tags("FAIL_PR_COV")
                            this_pruned = False
            # adding
            if not this_pruned:
                # todo(warn) penalize here according to two criteria
                if penalty > 0.:
                    one_score_cur = one.action_score()
                    one_score_cur -= sig_ngram_curnum[cur_sig] * penalty
                    if flag_not_best:
                        one_score_cur -= penalty
                    one.action_score(one_score_cur)
                # add all steps for this one
                for one_state in them:
                    one_sig = one_state.sig_ngram(ngram_n)
                    if one_sig not in sig_ngram_maxs or _get_score_f(one_state) > _get_score_f(sig_ngram_maxs[one_sig]):
                        sig_ngram_maxs[one_sig] = one_state
                    sig_ngram_allnum[one_sig] += 1
                sig_ngram_curnum[cur_sig] += 1      # only last step
                temp_ret.append(one)
            else:
                pruner_one = sig_ngram_maxs[cur_sig]
                # set pruners and record in the sg
                one.set("PR_PRUNER", pruner_one)
                pruner_one.add_list("PRUNING_LIST", one)
                if one == pruner_one:
                    utils.zlog("WHAT? Self-pruning?")
        else:
            # for example, the first several steps
            temp_ret.append(one)
    return temp_ret
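# --- Hedged, simplified sketch of the same idea (not the repo's Pruner/State API): hypotheses are
# --- plain (score, tokens) tuples, the "signature" is the last n tokens, at most `sig_beam_size`
# --- hypotheses per signature survive, and survivors must be within `thresh` of the signature's best.
def prune_by_ngram_sig(hyps, rest_beam_size, sig_beam_size, thresh, ngram_n=2):
    # hyps: list of (score, tokens), assumed sorted by score descending
    sig_best = {}       # signature -> best score seen so far
    sig_num = {}        # signature -> number of survivors
    kept = []
    for score, tokens in hyps:
        if len(kept) >= rest_beam_size:
            break       # beam-size pruning
        sig = tuple(tokens[-ngram_n:])
        best = sig_best.get(sig)
        if best is not None and (sig_num.get(sig, 0) >= sig_beam_size or score <= best - thresh):
            continue    # pruned: signature already full, or too far below its best
        sig_best[sig] = best if (best is not None and best > score) else score
        sig_num[sig] = sig_num.get(sig, 0) + 1
        kept.append((score, tokens))
    return kept

# example: prune_by_ngram_sig([(-1.0, ["a", "b"]), (-1.2, ["c", "b"]), (-5.0, ["a", "b"])], 10, 1, 2.0)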
def mt_decode(decode_way, test_iter, mms, target_dict, opts, outf, gold_iter=None):
    reranking = (gold_iter is not None)
    looping = isinstance(test_iter, Iterable)
    if reranking:
        cur_searcher = mt_search.search_rerank
    else:
        cur_searcher = {"greedy": mt_search.search_greedy, "beam": mt_search.search_beam,
                        "sample": mt_search.search_sample, "branch": mt_search.search_branch}[decode_way]
    one_recorder = OnceRecorder("DECODE")
    num_sents = len(test_iter)
    cur_sents = 0.
    sstater = StateStater()
    if opts["decode_extract_paraf"]:
        para_extractor = ParafExtractor(opts, target_dict)
    else:
        para_extractor = DummyParafExtractor(opts)
    # decoding them all
    results = ResultLogger(outf, target_dict, opts)
    tracking_list = None
    prev_point = 0
    # init normer
    for i, _m in enumerate(mms):
        _lg_params = _m.lg.obtain_params()
        utils.zlog("Model[%s] is with lg as %s." % (i, _lg_params,))
    _sigma = np.average([_m.lg.get_real_sigma() for _m in mms], axis=0)
    normer = get_normer(opts["normalize_way"], opts["normalize_alpha"], _sigma)
    # todo: ugly code here
    if looping:
        rs = cur_searcher(mms, test_iter, target_dict, opts, normer, sstater, para_extractor)
        results.add(rs)
        tracking_list = None
    else:
        if reranking:
            for one_tests, one_golds in zip(test_iter.arrange_batches(), gold_iter.arrange_batches()):
                if opts["verbose"] and (cur_sents - prev_point) >= (opts["report_freq"] * test_iter.bsize()):
                    utils.zlog("Reranking process: %.2f%%" % (cur_sents / num_sents * 100))
                    prev_point = cur_sents
                cur_sents += len(one_tests)
                mt_search.search_init()
                rs = cur_searcher(mms, [one_tests, one_golds], target_dict, opts, normer, sstater, para_extractor)
                results.add(rs)
                one_recorder.record(one_tests, {}, 0)
        else:
            for insts in test_iter.arrange_batches():
                if opts["verbose"] and (cur_sents - prev_point) >= (opts["report_freq"] * test_iter.bsize()):
                    utils.zlog("Decoding process: %.2f%%" % (cur_sents / num_sents * 100))
                    prev_point = cur_sents
                cur_sents += len(insts)
                mt_search.search_init()
                # return list(batch) of list(beam) of states
                rs = cur_searcher(mms, insts, target_dict, opts, normer, sstater, para_extractor)
                results.add(rs)
                one_recorder.record(insts, {}, 0)
        one_recorder.report()
        tracking_list = test_iter.get_tracking_list()
        # restore from sorting by length
        # results = test_iter.restore_order(results)
    # output
    sstater.report()
    results.finish(tracking_list)
    para_extractor.save_parafs(outf)
    utils.zlog("COV-LOG: " + CovChecker.report())
    if looping:
        return rs
    else:
        return None
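# --- Hedged sketch: get_normer() above is the repo's factory for score normalizers; as an
# --- assumption-labeled illustration only, one common choice of normalize_way is the GNMT-style
# --- length penalty score / ((5 + |y|) / 6)^alpha, with alpha playing the role of opts["normalize_alpha"].
def gnmt_length_normalized(score, length, alpha):
    lp = ((5.0 + length) / 6.0) ** alpha
    return score / lp

# example: gnmt_length_normalized(-12.3, length=20, alpha=0.6)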
def report(self, s=""): utils.zlog(s + self.state(), func="info") if self._mm is not None: self._mm.stat_report()
def analyse_single(golds, pred, kbest, n):
    # sentence-level bleu score & ranking (average or max)
    len_inst = len(pred)
    golds_len0 = [len(g[0]) for g in golds]
    # sentence-level smoothed bleu score; only consider the first k items in the list
    # t1: averaged ones
    utils.zlog("t1: Averages")
    # -- t10: average sent-bleu
    cum = 0.
    for p in pred:
        lp = len(p)
        cc = min(kbest, lp)
        cum += sum([z[0] for z in p.get("sb")[:cc]]) / cc
    utils.zlog("t10: Average sentence BLEU of kbest(k=%s) ones: BLEU=%.3f" % (kbest, cum/len_inst))
    # -- t11: average corpus-bleu
    count = 0
    cum1, cum2 = [0]*(n+1), [0]*(n+1)
    for p in pred:
        lp = len(p)
        cc = min(kbest, lp)
        count += cc
        s1, s2 = p.get("stat"), p.get("stat2")
        for i in range(cc):
            utils.Helper.add_inplace_list(cum1, s1[i])
            utils.Helper.add_inplace_list(cum2, s2[i])
    utils.zlog("t11: Average corpus BLEU of kbest(k=%s) ones, but average count is %.3f." % (kbest, count/len_inst))
    BleuCalculator.bleu4(cum1[0], cum2[0], cum1[1:], cum2[1:], report=True)
    # t2: oracle max (best sentence BLEU): influence on corpus-one-bleu
    utils.zlog("t2: About oracle max")
    # -- t20: how many are already oracle max & how much improvement oracle-max brings
    # --- pred best
    p_cum1, p_cum2 = [0]*(n+1), [0]*(n+1)
    o_cum1, o_cum2 = [0]*(n+1), [0]*(n+1)
    hit_counts = 0
    obest_ranks = []
    for p in pred:
        lp = len(p)
        cc = min(kbest, lp)
        s1, s2, sbs = p.get("stat"), p.get("stat2"), p.get("sb")
        obest = int(np.argmax([z[0] for z in sbs[:cc]]))
        # record oracle best
        obest_ranks.append(obest)
        if obest == 0:
            hit_counts += 1
        # record pred one
        utils.Helper.add_inplace_list(p_cum1, s1[0])
        utils.Helper.add_inplace_list(p_cum2, s2[0])
        # record oracle one
        utils.Helper.add_inplace_list(o_cum1, s1[obest])
        utils.Helper.add_inplace_list(o_cum2, s2[obest])
    utils.zlog("t20: oracle hit is %s/%s/%.3f; prediction & oracle" % (hit_counts, len_inst, hit_counts/len_inst))
    bleu_base, _ = BleuCalculator.bleu4(p_cum1[0], p_cum2[0], p_cum1[1:], p_cum2[1:], report=True)
    BleuCalculator.bleu4(o_cum1[0], o_cum2[0], o_cum1[1:], o_cum2[1:], report=True)
    # --
    if kbest > 1:
        sbleu_improves = []     # list of (score, p[0], p[oracle]) -> sentence bleu improves
        cbleu_improves = []     # list of (score, final) -> corpus bleu improves after replacing
        for oidx, p in zip(obest_ranks, pred):
            s1, s2, sbs = p.get("stat"), p.get("stat2"), p.get("sb")
            sbleu_improves.append((sbs[oidx][0]-sbs[0][0], sbs[0][-1], sbs[oidx][-1], "Oracle-Rank %s" % oidx))
            repl_cum1, repl_cum2 = p_cum1.copy(), p_cum2.copy()
            utils.Helper.add_inplace_list(repl_cum1, s1[0], -1)
            utils.Helper.add_inplace_list(repl_cum1, s1[oidx], 1)
            utils.Helper.add_inplace_list(repl_cum2, s2[0], -1)
            utils.Helper.add_inplace_list(repl_cum2, s2[oidx], 1)
            bleu_change = BleuCalculator.bleu4(repl_cum1[0], repl_cum2[0], repl_cum1[1:], repl_cum2[1:])
            cbleu_improves.append((bleu_change[0]-bleu_base, bleu_change[-1], sbs[0][-1], sbs[oidx][-1]))
        # -- t21: which ones improve most (sbleu) if replaced by oracle-max
        utils.zlog("t21: improves at sentence bleus")
        BleuCalculator._argmax_list(sbleu_improves, golds_len0=golds_len0)
        # -- t22: which ones improve most (replace-cbleu) if replaced by oracle-max
        utils.zlog("t22: improves at corpus bleus with replacing")
        BleuCalculator._argmax_list(cbleu_improves, golds_len0=golds_len0)
    # t3: improvements by gold (once is enough: thus only when kbest==1)
    if kbest == 1:
        utils.zlog("t3: About gold replacing")
        sbleu_goldimpr = []     # list of (score, p[0]) -> sentence bleu improves
        cbleu_goldimpr = []     # list of (score, final) -> corpus bleu improves after replacing
        for p in pred:
            s1, s2, sbs = p.get("stat"), p.get("stat2"), p.get("sb")
            sbleu_goldimpr.append((1.0-sbs[0][0], sbs[0][-1]))
            repl_cum1, repl_cum2 = p_cum1.copy(), p_cum2.copy()
            replace_counts = [s2[0][0]] + [max(0, s2[0][0]-i) for i in range(n)]
            utils.Helper.add_inplace_list(repl_cum1, s1[0], -1)
            utils.Helper.add_inplace_list(repl_cum1, replace_counts, 1)
            utils.Helper.add_inplace_list(repl_cum2, s2[0], -1)
            utils.Helper.add_inplace_list(repl_cum2, replace_counts, 1)
            bleu_change = BleuCalculator.bleu4(repl_cum1[0], repl_cum2[0], repl_cum1[1:], repl_cum2[1:])
            cbleu_goldimpr.append((bleu_change[0]-bleu_base, bleu_change[-1], sbs[0][-1]))
        # -- t31: which ones improve most (sbleu) if replaced by gold
        utils.zlog("t31: gold_comparing at sentence bleus")
        BleuCalculator._argmax_list(sbleu_goldimpr, golds_len0=golds_len0)
        # -- t32: which ones improve most (replace-cbleu) if replaced by gold
        utils.zlog("t32: gold_comparing at corpus bleus with replacing")
        BleuCalculator._argmax_list(cbleu_goldimpr, golds_len0=golds_len0)
def analyse_multi(golds, preds, kbest, n):
    # todo(warn): only compares the pred[0]
    golds_len0 = [len(g[0]) for g in golds]
    if kbest > 1:
        pass
    else:
        # which one gets the best max results
        utils.zlog("mt0: About comparing the predicted best ones")
        # - first get best sbleu (notice that this only takes the p[0], and the index is on the preds list)
        their_cums1 = [[0]*(n+1) for _ in range(len(preds))]
        their_cums2 = [[0]*(n+1) for _ in range(len(preds))]
        max_idxes = []
        num_pred = len(preds)
        len_inst = len(preds[0])
        for i in range(len_inst):
            their_results = []
            for j in range(num_pred):
                p = preds[j][i]
                s1, s2, sbs = p.get("stat"), p.get("stat2"), p.get("sb")
                utils.Helper.add_inplace_list(their_cums1[j], s1[0])
                utils.Helper.add_inplace_list(their_cums2[j], s2[0])
                their_results.append(sbs[0])
            # argmax
            max_idx = BleuCalculator._cmp(their_results)
            max_idxes.append(max_idx)
        num_equal = sum(1 if one is None else 0 for one in max_idxes)
        utils.zlog("Specifically, equal rate is: %d/%d/%.3f" % (num_equal, len_inst, num_equal/len_inst))
        # analyzing for each preds
        for j in range(num_pred):
            num_hit = sum(1 if j == one else 0 for one in max_idxes)
            num_good = num_hit + num_equal
            utils.zlog("Specifically, for file #%s: %d/%d(%.3f)/%d(%.3f)" % (j, num_hit, len_inst, num_hit/len_inst, num_good, num_good/len_inst))
            bleu_base, _ = BleuCalculator.bleu4(their_cums1[j][0], their_cums2[j][0], their_cums1[j][1:], their_cums2[j][1:], report=True)
            sbleu_improves = []     # list of (score, p[0], best[0]) -> sentence bleu improves
            cbleu_improves = []     # list of (score, final) -> corpus bleu improves after replacing
            cur_ii = 0
            for oidx, p in zip(max_idxes, preds[j]):
                if oidx is None:
                    # equal, count as self
                    oidx = j
                pbest = preds[oidx][cur_ii]
                s1, s2, sbs = p.get("stat"), p.get("stat2"), p.get("sb")
                b1, b2, bbs = pbest.get("stat"), pbest.get("stat2"), pbest.get("sb")
                if_in_idx = None
                for _i, _item in enumerate(sbs):
                    if bbs[0] == _item:
                        if_in_idx = _i
                sbleu_improves.append((bbs[0][0]-sbs[0][0], sbs[0][-1], bbs[0][-1], "Here-Rank %s" % if_in_idx))
                repl_cum1, repl_cum2 = their_cums1[j].copy(), their_cums2[j].copy()
                utils.Helper.add_inplace_list(repl_cum1, s1[0], -1)
                utils.Helper.add_inplace_list(repl_cum1, b1[0], 1)
                utils.Helper.add_inplace_list(repl_cum2, s2[0], -1)
                utils.Helper.add_inplace_list(repl_cum2, b2[0], 1)
                bleu_change = BleuCalculator.bleu4(repl_cum1[0], repl_cum2[0], repl_cum1[1:], repl_cum2[1:])
                cbleu_improves.append((bleu_change[0]-bleu_base, bleu_change[-1], sbs[0][-1]))
                cur_ii += 1
            # -- mt01: which ones improve most (sbleu) if replaced by the best-sbleu one
            utils.zlog("mt01: best_comparing at sentence bleus (improves from this one to the best)")
            BleuCalculator._argmax_list(sbleu_improves, golds_len0=golds_len0)
            # -- mt02: which ones improve most (replace-cbleu) if replaced by the best-sbleu one
            utils.zlog("mt02: best_comparing at corpus bleus with replacing (improves from this one to the best)")
            BleuCalculator._argmax_list(cbleu_improves, golds_len0=golds_len0)
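# --- Hedged sketch: BleuCalculator._cmp above is assumed (from how num_equal is counted) to return
# --- the index of the best per-sentence result, or None on ties; a minimal stand-in with that
# --- contract is sketched here (`argmax_or_none` is a made-up name).
def argmax_or_none(values):
    best = max(values)
    winners = [i for i, v in enumerate(values) if v == best]
    return winners[0] if len(winners) == 1 else None

# example: argmax_or_none([0.31, 0.42, 0.42]) -> None (tie); argmax_or_none([0.31, 0.42, 0.40]) -> 1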
def main():
    # init
    opts = mt_args.init("rerank")
    # special readings from args for re-ranking mode
    # only accept spaced (multi-mode) nbest files for target & non-multi for golds
    # 1. data (only accepting nbest files)
    source_dict, target_dict = Vocab.read(opts["dicts"][0]), Vocab.read(opts["dicts"][1])
    dicts = [source_dict] + [target_dict for _ in opts["test"][1:]]
    test_iter = get_arranger_simple(opts["test"], dicts, multis=[False] + [True for _ in opts["test"][1:]],
                                    batch_size=opts["test_batch_size"])
    gold_iter = get_arranger_simple(opts["gold"], [target_dict for _ in opts["gold"]], multis=False,
                                    batch_size=opts["test_batch_size"])
    utils.zcheck_matched_length(test_iter, gold_iter)
    # 2. model
    mm = []
    try:
        for mn in opts["models"]:
            x = mt_mt.s2sModel(opts, source_dict, target_dict, None)    # rebuild from opts, thus use the same opts when testing
            try:
                x.load(mn)
            except Exception:
                utils.zlog("Load model error %s!" % mn, func="warn")
            mm.append(x)
    except Exception:
        pass
    # 3. analysis
    if len(mm) == 0:
        utils.zlog("No models specified, only analysing!", func="warn")
        num_test = len(opts["test"]) - 1
        golds = []
        srcs = []
        preds = [[] for _ in range(num_test)]
        for one in gold_iter.arrange_batches():
            golds += one
        for one in test_iter.arrange_batches():
            for zz in one:
                zzs = zz.extract()
                srcs.append(zzs[0])
                for i in range(num_test):
                    preds[i].append(zzs[i+1])
        Analyzer.analyse(srcs, golds, preds, kbests=opts["rr_analysis_kbests"])
    # 4. rerank
    else:
        utils.zlog("=== Start to rerank ===", func="info")
        with utils.Timer(tag="Reranking", print_date=True):
            mt_decode(None, test_iter, mm, target_dict, opts, opts["output"], gold_iter=gold_iter)
        utils.zlog("=== End reranking, write to %s ===" % opts["output"], func="info")
        mt_eval.evaluate(opts["output"], opts["gold"][0], opts["eval_metric"])