def traverse(self, right_idx=0, right_widx=0, fsent=None, rules=None, nodememo=None):
    '''Helper called by dump(); recursively establishes each node's span.

    Walks the forest top-down, pruning each node's deductions to the
    max_edges_per_node cheapest (by model cost under the global `weights`),
    and computes the node's character span (self.i, self.j) and word span
    (self.wi, self.wj).

    NOTE(review): despite the original docstring saying "returns a string",
    this method returns None; its results are left on the nodes and in
    `nodememo`, which maps id(node) -> (serial node id, accumulated #edges
    in the subforest).

    N.B.: `fsent` is the REVERSED foreign sentence (dump() passes
    `reversed_fsent`), so `right_idx`/`right_widx` are right boundaries in
    that reversed frame; dump() later calls adjust_spans() to swap back.
    '''
    if nodememo is None:
        nodememo = {}
    if id(self) in nodememo:
        # already visited via another parent; spans and counts are done
        return
    # sort deductions by model cost and keep only the cheapest few
    deds = [(ded.dcost.dot(weights), ded) for ded in self.deds]
    deds.sort()
    deds = [x for _, x in deds[:max_edges_per_node]]
    self.deds = deds  # prune!
    nedges = len(deds)  # accumulating number of edges, recursively
    self.i = right_idx
    self.wi = right_widx
    for dedid, ded in enumerate(deds):
        try:
            rule = rules[ded.ruleid]
        except:
            # NOTE(review): bare except — presumably guarding a KeyError on
            # an unseen rule id; a narrower `except KeyError:` would be safer.
            print >> sys.stderr, "WARNING: rule %d not found" % ded.ruleid
            ## assuming it's a one-word UNKNOWN rule
            ## TODO: check with lattice
            unkword = fsent[self.wi]
            rule = 'UNKNOWN("@UNKNOWN@") -> "%s"' % unkword  # in reverse order
            # cache the synthesized rule so later nodes reuse it
            rules[ded.ruleid] = rule
            print >> sys.stderr, "  covering " + unkword
        self.x = rule.split("(", 1)[0]  # non-terminal label
        # analyse RHS (chinese side)
        lhs, rhs = rule.split(" -> ", 1)  ## -> might be a word
        # deal with lhs; convert to ded.lhsstr = ["...", "...", Item(...), "..."]
        # i.e. quoted terminals interleaved with antecedent Item objects
        varid = 0
        lhsstr = []
        for child in ded.rule.e:
            if sym.isvar(child):
                lhsstr.append(ded.ants[varid])
                varid += 1
            else:
                lhsstr.append(quoteattr(sym.tostring(child)))
        # will be used in _dump()
        ded.lhsstr = lhsstr
        # scan the RHS right-to-left, recording for each variable the gap
        # (in chars and words) of terminals lying to its RIGHT
        vars = []
        chars_in_gap = 0
        words_in_gap = 0
        for it in reversed(rhs.split()):  ## from RIGHT to LEFT!! N.B. can't split(" ")
            if it[0] == "x":
                # variable: remember it with the gap accumulated so far
                var = int(it[1:])
                vars.append((var, chars_in_gap, words_in_gap))
                chars_in_gap = 0
                words_in_gap = 0
            else:
                # strip off quotes "..."
                it = it[1:-1]
                # calculate char-length
                if it == foreign_sentence_tag:
                    # <foreign-sentence>: glue symbol is not counted!
                    chars_in_gap += 0
                    words_in_gap += 0
                else:
                    # 1 for word, len(...) for char
                    chars_in_gap += len(words_to_chars(it, encode_back=True))
                    words_in_gap += 1
        # recurse into antecedents, right-to-left, threading the boundary
        accumu = self.i  ## left boundary
        waccumu = self.wi
        for i, c_gap, w_gap in vars:
            ##for sub in ded.ants:
            sub = ded.ants[i]
            if id(sub) not in nodememo:
                sub.traverse(accumu + c_gap, waccumu + w_gap, fsent, rules, nodememo)
            # accumulating # of edges (if first seen)
            nedges += nodememo[id(sub)][1]
            ## don't accumulate subs now; will do in another visit
            ## s += subs
            accumu = sub.j
            waccumu = sub.wj
        # the node's right char boundary: last antecedent's j (or own i if
        # no variables) plus the leftover terminal gap
        tmp_j = (ded.ants[vars[-1][0]].j if vars != [] else self.i) + chars_in_gap
        if self.j is not None and self.j != tmp_j:
            # all deductions of a node must induce the same span
            assert False, "@sentence %d, node #%s, %d %d != %d %s rule %d" % \
                  (opts.sentid, self.nodeid, self.i, self.j, tmp_j, self.x, ded.ruleid)
        self.j = tmp_j
        # same computation in word units
        tmp_wj = (ded.ants[vars[-1][0]].wj if vars != [] else self.wi) + words_in_gap
        ##
        self.wj = tmp_wj
    # assign a serial id and record (id, subforest edge count) for dump()
    self.id = len(nodememo) + 1
    nodememo[id(self)] = (self.id, nedges)
def dump(self, rules=None, sid=1, fsent="<foreign-sentence>", byline="", reflines=[]): nodememo = {} # to keep track of sizes (# of nodes, # of edges) # forest id, foreign sentence (TODO: refs) fsent = fsent.split(" ") s = "%s\t%s\n" % (sid, " ".join(fsent)) + \ "%d\n" % len(reflines) + \ "".join(reflines) flen = len(words_to_chars(fsent, encode_back=True)) fwlen = len(fsent) reversed_fsent = list(reversed(fsent)) ## RIGHT TO LEFT if byline != "": self.traverse(0, 0, reversed_fsent, rules, nodememo) ## swap back self.adjust_spans(flen, fwlen) byline = byline.split(" ") byline_flen = self.i byline_fwlen = self.wi byline_f = fsent[:byline_fwlen] print >> logs, "clen (non-byline) = %d (%d)" % (flen, self.j - self.i) print >> logs, "wlen (non-byline) = %d (%d)" % (fwlen, self.wj - self.wi) print >> logs, "BYLINE = " + " ".join(byline_f) + \ " ### %d chars, %d words" % (byline_flen, byline_fwlen) assert len(words_to_chars(byline_f)) == byline_flen, "@sentence %d, BYLINE Error" % opts.sentid ## check consistency ## new rule/edge ## TOP("by" "line" x0:TOP) -> "BY" "LINE" x0 ### id=-1 byline_e = " ".join('"%s"' % w for w in byline) lhs = "TOP(" + byline_e + " x0:%s)" % self.x # "TOP" rhs = " ".join('"%s"' % w for w in byline_f) + " x0" # byline rule, id=-1 rid = -1 rules[rid] = "%s -> %s ### id=%d" % (lhs, rhs, rid) ## make david-style LHS david_lhs = [] for w in byline: david_lhs.append(sym.fromstring(w)) david_lhs.append(sym.setindex(dummylabel, 1)) ded = Deduction([self], rule.Rule(rid, rule.Phrase(david_lhs), rule.Phrase(david_lhs)),\ svector.Vector()) ded.lhsstr = byline_e.split() + [self] ## N.B.: dont forget "..." 
ded.ruleid = rid # new node on top of TOP oldtop = self self = Item(self.x, 0, flen, deds=[ded]) self.x = oldtop.x self.wi = 0 self.wj = fwlen self.id = len(nodememo)+1 nodememo[id(self)] = (self.id, nodememo[id(oldtop)][1]+1) #edges else: # establish node spans self.traverse(0, 0, reversed_fsent, rules, nodememo) # swap i,j self.adjust_spans(flen, fwlen) ## lhuang: the following is from hope.py ## be very careful about weights interpolation sg = sgml.Sentence(fsent) sg.fwords = fsent sg.refs = [refline.split(" ") for refline in reflines] if sg.refs: theoracle.input(sg, verbose=False) # 1-best self.reweight(weights) output(self, "1-best @ %s" % sid, onebestbleus, onebestscores) base_oracleweights = theoracle.make_weights(additive=True) # we use the in-place operations because oracleweights might be # a subclass of Vector for relative in []:#[opts.hope]: oracleweights = theoracle.make_weights(additive=True) oracleweights *= relative # interpolation: taking modelcost into account oracleweights += weights # compute oracle self.rescore(theoracle.models, oracleweights, add=True) # TODO: why?? output(self, "hope%s " % relative, hopebleus[relative], hopescores[relative]) # right boundary should match sentence length (in chars) assert self.j == flen and self.wj == fwlen, \ "@sentence %d, Boundary Mismatch at %s\t%s" % (opts.sentid, sid, fsent) + \ "self.j=%d, flen=%d; self.wj=%d, fwlen=%d" % (self.j, flen, self.wj, fwlen) s += "%d\t%d\n" % nodememo[id(self)] + \ self._dump(rules, deriv=self.viterbi_deriv()) return s