def beam_search(self, forest, b=1):
    """Beam-search decode ``forest`` with beam width ``b``.

    When ``FLAGS.futurecost`` is set, a first pass calls
    ``forest.bestparse`` and accumulates its score and BLEU into
    ``self.firstpassscore`` / ``self.firstpassbleus``.  The search then
    walks the beams step by step: each beam is sorted and cut to ``b``
    entries, final states are collected in ``self.final_items`` and all
    other states are expanded with ``predict()`` (plus ``complete()`` when
    ``FLAGS.complete`` is set), each successor going through
    ``self.add_state``.

    Returns ``(best, top)``: the first of the sorted final items and the
    first ``b`` of them.  Raises IndexError if no final item was reached.
    """
    if FLAGS.futurecost:
        # First pass: best parse under the model weights plus an "lm1" weight.
        lm1_weight = LMState.weights["lm"] * FLAGS.lmratio
        forest.bestparse(LMState.weights + Vector("lm1=%s" % lm1_weight))
        first_score, first_trans, _ = forest.root.bestres
        forest.bleu.rescore(first_trans)
        print >> logs, "1-best score: %.3f, bleu: %s" % (first_score, forest.bleu.score_ratio_str())
        self.firstpassscore += first_score
        self.firstpassbleus += forest.bleu

    # Per-sentence statistics, reset on every call.
    self.num_stacks = 0
    self.num_states = self.num_edges = 0

    self.final_items = []
    self.best = None

    # One container per step: a list in "newbeam" mode, a dict otherwise.
    beam_factory = list if FLAGS.newbeam else dict
    self.beams = beams = defaultdict(beam_factory)  # +inf
    self.max_step = -1

    if FLAGS.taro:
        initial = TaroState.start_state(forest.root)
    else:
        initial = LMState.start_state(forest.root)
    self.add_state(initial)  # initial state

    self.nstates = 0  # space complexity
    self.nedges = 0  # time complexity

    step = 0
    # self.max_step is re-read on every pass; presumably add_state raises it
    # as later steps get populated -- confirm against add_state.
    while step <= self.max_step:
        if FLAGS.newbeam:
            # Prune first, then drop duplicates, so the surviving beam may
            # end up holding fewer than b states.
            seen = set()
            stacks = set()
            curr_beam = []
            for cand in sorted(beams[step])[:b]:
                if cand in seen:
                    continue
                seen.add(cand)
                curr_beam.append(cand)
                stacks.add(cand.stack)
            self.num_stacks += len(stacks)
        else:
            # N.B.: values, not keys! (keys may not be updated)
            ranked = list(beams[step].values())
            ranked.sort()
            curr_beam = ranked[:b]  # already unique

        self.num_states += len(curr_beam)

        if FLAGS.debuglevel >= 1:
            print >> logs, "beam %d, %d states" % (step, len(curr_beam))
            print >> logs, "\n".join(str(state) for state in curr_beam)
            print >> logs

        for state in curr_beam:
            if state.is_final():
                self.final_items.append(state)
                continue
            for succ in state.predict():
                self.add_state(succ)
            if FLAGS.complete:
                for succ in state.complete():
                    self.add_state(succ)

        step += 1

    self.final_items.sort()
    finals = self.final_items
    return finals[0], finals[:b]
def main(): weights = Model.cmdline_model() lm = Ngram.cmdline_ngram() LMState.init(lm, weights) decoder = Decoder() tot_bleu = Bleu() tot_score = 0. tot_time = 0. tot_len = tot_fnodes = tot_fedges = 0 tot_steps = tot_states = tot_edges = tot_stacks = 0 for i, forest in enumerate(Forest.load("-", is_tforest=True, lm=lm), 1): t = time.time() best, final_items = decoder.beam_search(forest, b=FLAGS.beam) score, trans, fv = best.score, best.trans(), best.get_fvector() t = time.time() - t tot_time += t tot_score += score forest.bleu.rescore(trans) tot_bleu += forest.bleu fnodes, fedges = forest.size() tot_len += len(forest.sent) tot_fnodes += fnodes tot_fedges += fedges tot_steps += decoder.max_step tot_states += decoder.num_states tot_edges += decoder.num_edges tot_stacks += decoder.num_stacks print >> logs, ("sent %d, b %d\tscore %.4f\tbleu+1 %s" + \ "\ttime %.3f\tsentlen %-3d fnodes %-4d fedges %-5d\tstep %d states %d edges %d stacks %d") % \ (i, FLAGS.beam, score, forest.bleu.score_ratio_str(), t, len(forest.sent), fnodes, fedges, decoder.max_step, decoder.num_states, decoder.num_edges, decoder.num_stacks) if FLAGS.k > 1 or FLAGS.forest: lmforest = best.toforest(forest) if FLAGS.forest: lmforest.dump() if FLAGS.k > 1: lmforest.lazykbest(FLAGS.k, weights=weights) klist = lmforest.root.klist if not FLAGS.mert: for j, (sc, tr, fv) in enumerate(klist, 1): print >> logs, "k=%d score=%.4f fv=%s\n%s" % (j, sc, fv, tr) else: klist = [(best.score, best.trans(), best.get_fvector())] if FLAGS.mert: # <score>... <hyp> ... 
print >> logs, '<sent No="%d">' % i print >> logs, "<Chinese>%s</Chinese>" % " ".join(forest.cased_sent) for sc, tr, fv in klist: print >> logs, "<score>%.3lf</score>" % sc print >> logs, "<hyp>%s</hyp>" % tr print >> logs, "<cost>%s</cost>" % fv print >> logs, "</sent>" if not FLAGS.forest: print trans print >> logs, "avg %d sentences, first pass score: %.4f, bleu: %s" % \ (i, decoder.firstpassscore/i, decoder.firstpassbleus.score_ratio_str()) print >> logs, ("avg %d sentences, b %d\tscore %.4lf\tbleu %s\ttime %.3f" + \ "\tsentlen %.1f fnodes %.1f fedges %.1f\tstep %.1f states %.1f edges %.1f stacks %.1f") % \ (i, FLAGS.beam, tot_score/i, tot_bleu.score_ratio_str(), tot_time/i, tot_len/i, tot_fnodes/i, tot_fedges/i, tot_steps/i, tot_states/i, tot_edges/i, tot_stacks/i) print >> logs, LMState.cachehits, LMState.cachemiss