Example #1
 def __call__(self, states, lengths):
     utils.zcheck_matched_length(states, lengths, _forced=True)
     for beam, _l in zip(states, lengths):
         for one in beam:
             log_prob = LinearGaussain.log_gaussian(one.length, _l, self._sigma)
             log_prob *= self._alpha
             one.set_score_final(one.score_partial + log_prob)
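All of the snippets on this page call utils.zcheck_matched_length to assert that two parallel sequences line up one-to-one. The helper's actual implementation is not shown here; a minimal sketch of what such a check presumably does (raising when _forced is set, otherwise just warning) could be:
 def zcheck_matched_length(a, b, _forced=False):
     # hypothetical stand-in for the project's utils.zcheck_matched_length
     if len(a) != len(b):
         msg = "Unmatched lengths: %d vs %d" % (len(a), len(b))
         if _forced:
             raise RuntimeError(msg)
         print("WARNING: " + msg)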
Example #2
 def fit_once(fit_files):
     # first fit a simple model: y = gaussian(a*x + b, sigma); xenc is not included here since that would be too large
     with utils.Timer(tag="Fit-length-once", print_date=True):
         # 1. collect length
         with utils.zopen(fit_files[0]) as f0, utils.zopen(fit_files[1]) as f1:
             # todo(warn): plus one for the <eos> tokens
             x = [LinearGaussain.trans_len(len(_l.split())+1) for _l in f0]
             y = [LinearGaussain.trans_len(len(_l.split())+1) for _l in f1]
         utils.zcheck_matched_length(x, y, _forced=True)
         ll = len(x)
         x1, y1 = np.array(x, dtype=np.float32).reshape((-1,1)), np.array(y, dtype=np.float32)
         # 2. fit linear model
         try:    # todo(warn)
             regr = linear_model.LinearRegression()
             regr.fit(x1, y1)
             a, b = float(regr.coef_[0]), float(regr.intercept_)
         except:
             utils.zlog("Cannot linear-regression, skip that.")
             a, b = 1., 0.
         # 3. fit sigma
         x1 = x1.reshape((-1,))
         errors = a*x1+b - y1
         mu = np.mean(errors)
         sigma = np.sqrt(((errors - mu)**2).mean())
         ret = (a, b, sigma, mu)
         del x, y, x1, y1
         utils.zlog("Fitting Length with %s sentences and get %s." % (ll, ret), func="score")
     return ret
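Example #2 produces the (a, b, sigma, mu) parameters that Examples #1 and #3 consume. The LinearGaussain.log_gaussian call in Example #1 presumably evaluates a Gaussian log-density of the observed length under the predicted mean; a sketch under that assumption (the project's version may drop constants or scale differently):
 import math

 def log_gaussian(x, mu, sigma):
     # log N(x; mu, sigma^2) = -0.5*log(2*pi*sigma^2) - (x - mu)^2 / (2*sigma^2)
     return -0.5 * math.log(2 * math.pi * sigma * sigma) - (x - mu) ** 2 / (2 * sigma * sigma)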
Example #3
 def __call__(self, states, lengths):
     utils.zcheck_matched_length(states, lengths, _forced=True)
     for beam, _l in zip(states, lengths):
         for one in beam:
             # todo(warn): mu+2*si -> 95%
             el = min(one.length, _l+2*self._sigma)
             one.set_score_final(one.score_partial + el*self._alpha)
Example #4
 def bleu4(length_gold, length_pred, counts_hit, counts_all, smoothing=False, report=False, ss_short=False):
     s, cc = 0., 0
     them = {}
     bp = BleuCalculator.brevity_penalty(length_gold, length_pred)
     them["bp"] = bp
     utils.zcheck_matched_length(counts_hit, counts_all)
     for h, a in zip(counts_hit, counts_all):
         if cc>0 and smoothing:
             # +1 smooth for n>1 maybe # todo(warn) may result in 0/*/*/*
             vv = (h+1)/(a+1)
         else:
             vv = h/a
         them[cc] = vv
         if vv <= 0:
             utils.zlog("Zero 1-gram counts !!", func="warn")
             s += utils.Constants.MIN_V
         else:
             s += math.log(vv)
         cc += 1
     s /= cc
     bleu = bp * math.exp(s)
     them["bleu"] = bleu
     ss = None
     if report:
         # utils.zlog("BLEU4-Counts: %s-%s" % (counts_hit, counts_all))
         ss = "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, hyp_len=%d, ref_len=%d)" \
               % (them["bleu"]*100, them[0]*100, them[1]*100, them[2]*100, them[3]*100, bp, length_pred, length_gold)
         utils.zlog(ss)
     if ss_short:
         ss = "%.2f(BP=%.3f,L=%d)" % (them["bleu"]*100, bp, length_pred)
     return bleu, ss
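BleuCalculator.brevity_penalty above is presumably the standard BLEU brevity penalty, BP = min(1, exp(1 - ref_len/hyp_len)); a sketch under that assumption:
 import math

 def brevity_penalty(length_gold, length_pred):
     # standard BLEU brevity penalty; guard against empty hypotheses
     if length_pred >= length_gold:
         return 1.0
     if length_pred == 0:
         return 0.0
     return math.exp(1.0 - length_gold / length_pred)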
Example #5
 def rerange(self, nexts):
     # nexts is list of list of (prev-idx, next_token), return orders, next_ys
     utils.zcheck_matched_length(nexts, self.shapes)
     bv_orders = []
     new_ys = []
     break_bv, break_bi = False, False
     for i in range(len(nexts)):
         nns = nexts[i]
         bas = self.bases[i]
         if len(nns) != self.shapes[i]:
             break_bi = True
         for one in nns:
             j = one[0]
             if (len(bv_orders) == 0 and bas + j != 0) or (
                     len(bv_orders) > 0 and bas + j - 1 != bv_orders[-1]):
                 break_bv = True
             bv_orders.append(bas + j)
             new_ys.append(one[1])
     if len(bv_orders) != self.bases[-1]:
         break_bv, break_bi = True, True
     # rebuild
     self._build(nexts)
     # return
     if break_bi:
         return bv_orders, bv_orders, new_ys
     elif break_bv:
         return bv_orders, None, new_ys
     else:
         return None, None, new_ys
Example #6
 def _restore_order(self, tracking_list, x):
     if not tracking_list:
         return x
     # todo(warn): BatchArranger.restore_order
     utils.zcheck_matched_length(tracking_list, x, _forced=True)
     ret = [None for _ in x]
     for idx, one in zip(tracking_list, x):
         utils.zcheck_type(ret[idx],
                           type(None),
                           "Wrong tracking list, internal error!!",
                           _forced=True)
         ret[idx] = one
     return ret
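Here tracking_list[i] records the original position of x[i] after the batch arranger reordered the items. A standalone sketch of the same restore step, with hypothetical inputs:
 def restore_order(tracking_list, x):
     # invert the recorded permutation: item x[i] goes back to slot tracking_list[i]
     ret = [None] * len(x)
     for idx, one in zip(tracking_list, x):
         ret[idx] = one
     return ret

 print(restore_order([2, 0, 1], ['a', 'b', 'c']))  # -> ['b', 'c', 'a']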
Example #7
 def add_clipped_counts(golds, preds, n, on_words=False):
     if on_words:
         get_list_ff = lambda x, i: x.get_words(i)
     else:
         # remember to remove EOS
         get_list_ff = lambda x, i: x[i][:-1]
     # preds is list of TextInstance(multi), golds is list of TextInstance as the References
     for ones in preds:
         utils.zcheck_matched_length(ones, golds)
     # countings
     for ones in preds:
         for one_pred, one_gold in zip(ones, golds):
             l_pred, l_gold = len(one_pred), len(one_gold)
             # count gold ngrams
             ngrams_gold = [defaultdict(int) for _ in range(n)]
             lengths_gold = []
             for i in range(l_gold):
                 one_list = get_list_ff(one_gold, i)
                 lengths_gold.append(len(one_list))
                 ngrams_one_gold = BleuCalculator.count_ngrams(one_list, n)
                 for onegram_gold, onegram_one_gold in zip(ngrams_gold, ngrams_one_gold):
                     for k in onegram_one_gold:
                         onegram_gold[k] = max(onegram_gold[k], onegram_one_gold[k])
             # count pred ngrams and store
             infos, infos2, sbs = [], [], []
             for i in range(l_pred):
                 one_list = get_list_ff(one_pred, i)
                 ngrams_one_pred = BleuCalculator.count_ngrams(one_list, n)
                 # the information to be collected (0: closest ref length; 1,2,3,4: clipped n-gram match counts)
                 crl = BleuCalculator.closest_ref_length(lengths_gold, len(one_list))
                 info, info2 = [crl], [len(one_list)]
                 for _i in range(n):
                     info2.append(max(info2[0]-_i, 0))
                 for onegram_gold, onegram_one_pred in zip(ngrams_gold, ngrams_one_pred):
                     cc = 0
                     for k in onegram_one_pred:
                         cc += min(onegram_gold[k], onegram_one_pred[k])
                     info.append(cc)
                 one_sb = BleuCalculator.bleu4(info[0], info2[0], info[1:], info2[1:], smoothing=True, ss_short=True)
                 sbs.append(one_sb)
                 infos.append(info)
                 infos2.append(info2)
             one_pred.set("stat", infos)
             one_pred.set("stat2", infos2)
             one_pred.set("sb", sbs)
Example #8
def main():
    # init
    opts = mt_args.init("rerank")
    # special readings from args for re-ranking mode
    # only accept spaced (multi-mode) nbest files for target & non-multi for golds
    # 1. data (only accepting nbest files)
    source_dict, target_dict = Vocab.read(opts["dicts"][0]), Vocab.read(
        opts["dicts"][1])
    dicts = [source_dict] + [target_dict for _ in opts["test"][1:]]
    test_iter = get_arranger_simple(opts["test"],
                                    dicts,
                                    multis=[False] +
                                    [True for _ in opts["test"][1:]],
                                    batch_size=opts["test_batch_size"])
    gold_iter = get_arranger_simple(opts["gold"],
                                    [target_dict for _ in opts["gold"]],
                                    multis=False,
                                    batch_size=opts["test_batch_size"])
    utils.zcheck_matched_length(test_iter, gold_iter)
    # 2. model
    mm = []
    try:
        for mn in opts["models"]:
            x = mt_mt.s2sModel(
                opts, source_dict, target_dict,
                None)  # rebuild from opts, thus use the same opts when testing
            try:
                x.load(mn)
            except:
                utils.zlog("Load model error %s!" % mn, func="warn")
            mm.append(x)
    except:
        pass
    # 3. analysis
    if len(mm) == 0:
        utils.zlog("No models specified, only analysing!", func="warn")
        num_test = len(opts["test"]) - 1
        golds = []
        srcs = []
        preds = [[] for _ in range(num_test)]
        for one in gold_iter.arrange_batches():
            golds += one
        for one in test_iter.arrange_batches():
            for zz in one:
                zzs = zz.extract()
                srcs.append(zzs[0])
                for i in range(num_test):
                    preds[i].append(zzs[i + 1])
        Analyzer.analyse(srcs, golds, preds, kbests=opts["rr_analysis_kbests"])
    # 4. rerank
    else:
        utils.zlog("=== Start to rerank ===", func="info")
        with utils.Timer(tag="Reranking", print_date=True):
            mt_decode(None,
                      test_iter,
                      mm,
                      target_dict,
                      opts,
                      opts["output"],
                      gold_iter=gold_iter)
        utils.zlog("=== End reranking, write to %s ===" % opts["output"],
                   func="info")
        mt_eval.evaluate(opts["output"], opts["gold"][0], opts["eval_metric"])