def make_forest(fieldss):
    """Build a forest from a sequence of hypothesis records.

    Each element of ``fieldss`` is a mapping describing one hypothesis
    (presumably one parsed line of a decoder search graph -- confirm
    against the caller).  Keys read here:

    * ``'hyp'``      -- id of the hypothesis; id 0 is the initial, empty one.
    * ``'back'``     -- id of the antecedent hypothesis (non-initial only).
    * ``'scores'``   -- string matched by the module-level ``scores_re``;
      group 1 is a comma-separated list of floats stored as ``_core<i>``
      features, group 2 is handed to ``svector.Vector``.
    * ``'src-phrase'`` / ``'tgt-phrase'`` -- whitespace-separated words.
    * ``'cover-start'`` / ``'cover-end'`` -- inclusive source span, used
      only for a length sanity warning on stderr.
    * ``'forward'``  -- a negative value marks the hypothesis as a goal.

    Several records may share one ``'hyp'`` id; each contributes one more
    deduction to the same node.  A record's antecedent must have appeared
    earlier in ``fieldss`` (otherwise ``KeyError``), and a ``'scores'``
    value that ``scores_re`` does not match raises ``AttributeError``.

    Returns a fresh goal ``forest.Item`` with one rule-less deduction per
    goal hypothesis.
    """
    # These symbols are the same for every record, so build them once.
    phrase = sym.fromtag('PHRASE')
    phrase_var = sym.setindex(phrase, 1)

    nodes = {}
    goal_ids = set()
    for fields in fieldss:
        # Normalise the id to int: antecedents are looked up through
        # int(fields['back']) and the initial node is detected with == 0,
        # so a raw string id such as '0' would never match either.
        node_id = int(fields['hyp'])
        if node_id not in nodes:
            nodes[node_id] = forest.Item(phrase, 0, 0, [])
        node = nodes[node_id]

        if node_id == 0:
            # Initial hypothesis: an empty rule with no antecedents.
            r = rule.Rule(phrase, rule.Phrase([]), rule.Phrase([]))
            node.deds.append(forest.Deduction((), r, svector.Vector()))
        else:
            m = scores_re.match(fields['scores'])
            core_values = [float(x) for x in m.group(1).split(',')]
            dcost = svector.Vector(m.group(2).encode('utf8'))
            for i, x in enumerate(core_values):
                dcost["_core%d" % i] = x

            ant = nodes[int(fields['back'])]
            f = fields['src-phrase'].encode('utf8').split()
            e = fields['tgt-phrase'].encode('utf8').split()
            covered = int(fields['cover-end']) - int(fields['cover-start']) + 1
            if len(f) != covered:
                sys.stderr.write("warning: French phrase length didn't match covered length\n")

            # Both sides start with the variable standing for the
            # antecedent, followed by the newly added words.
            r = rule.Rule(phrase,
                          rule.Phrase([phrase_var] + f),
                          rule.Phrase([phrase_var] + e))
            node.deds.append(forest.Deduction((ant,), r, dcost))

        if int(fields['forward']) < 0:  # goal
            goal_ids.add(node_id)

    goal = forest.Item(None, 0, 0, [])
    for node_id in goal_ids:
        goal.deds.append(forest.Deduction((nodes[node_id],), None, svector.Vector()))
    return goal
sub_j2 -= j1 # Check English holes # can't lie outside phrase if sub_j1 < 0 or sub_j2 > elen: return None # can't overlap for j in xrange(sub_j1, sub_j2): if type(ewords[j]) is tuple or ewords[j] is None: return None # Set first eword to var, rest to None # We'll clean up the Nones later v = sym.setindex(sub_x, index) fwords[i] = v fpos[i] = (sub_i1, sub_i2) ewords[sub_j1] = (v, sub_j1 + j1, sub_j2 + j1) for j in xrange(sub_j1 + 1, sub_j2): ewords[j] = None index += 1 # Require an aligned French word if opts.require_aligned_terminal and not flag: return None epos = [] new_ewords = [] for i in xrange(elen): if ewords[i] is not None:
def forest_from_text_helper(tokiter, memo, want_item=False, delete_words=[]):
    """Yield the forest objects described by a token stream, up to a 'pop'.

    Currently this assumes that the only frontier nodes in the tree are
    words.

    tokiter      -- iterator over token tuples whose first element is the
                    token type:
                      ("or", nodeid)
                      ("nonterm", nodeid, ruleid, dcoststr)
                      ("term", word)
                      ("ref", nodeid)
                      ("pop", ...)
    memo         -- dict mapping node ids to already-built objects; filled
                    in for tokens that carry a non-empty nodeid and read
                    by "ref" tokens.
    want_item    -- if true, every "nonterm" deduction is wrapped in an
                    Item of its own before being memoised and yielded.
    delete_words -- container of terminals to drop silently (only read,
                    never modified).

    The generator consumes tokens from ``tokiter`` and finishes at the
    first "pop" seen at its own nesting level.  Raises TreeFormatException
    if the stream ends before that "pop" or on an unknown token type, and
    KeyError for a "ref" to an id that is not in ``memo``.
    """
    while True:
        try:
            tok = next(tokiter)
        except StopIteration:
            raise TreeFormatException("incomplete tree")
        toktype = tok[0]

        if toktype == "or":
            _, nodeid = tok
            deds = list(forest_from_text_helper(tokiter, memo,
                                                delete_words=delete_words))
            node = Item(dummylabel, dummyi, dummyj, deds=deds)
            if nodeid:
                memo[nodeid] = node
                node.nodeid = nodeid
            yield node

        elif toktype == "nonterm":
            _, nodeid, ruleid, dcoststr = tok
            if ruleid == "":
                ruleid = dummylabel
                # No rule id on this token.  Without this assignment the
                # name would be unbound below, or still hold the id left
                # over from an earlier token handled by this same loop.
                xrs_ruleid = None
            else:
                # lhuang: N.B.: sym.fromtag would re-alloc it
                xrs_ruleid = int(ruleid)
                ruleid = sym.fromtag(ruleid)

            dcost = svector.Vector()
            if dcoststr:
                # lhuang: features are read from forest, not rules
                # so there is no "e^..." or "10^..."
                for fv in dcoststr.split(','):
                    f, v = fv.split(':', 1)
                    dcost[f] = float(v)

            ants = []
            rhs = []
            vi = 1
            for child in forest_from_text_helper(tokiter, memo, want_item=True,
                                                 delete_words=delete_words):
                if isinstance(child, Item):
                    # Item children become indexed variables in the rule.
                    ants.append(child)
                    rhs.append(sym.setindex(dummylabel, vi))
                    vi += 1
                else:
                    rhs.append(child)

            r = rule.Rule(ruleid, rule.Phrase(rhs), rule.Phrase(rhs))
            node = Deduction(ants=ants, rule=r, dcost=dcost)
            node.ruleid = xrs_ruleid

            if want_item:  # need to insert OR node
                node = Item(dummylabel, dummyi, dummyj, deds=[node])
            if nodeid:
                memo[nodeid] = node
            yield node

        elif toktype == 'term':
            terminal = tok[1]
            if terminal not in delete_words:
                yield sym.fromstring(terminal)

        elif toktype == 'ref':
            yield memo[tok[1]]

        elif toktype == 'pop':
            return

        else:
            raise TreeFormatException("unknown token %s" % (tok,))
def dump(self, rules=None, sid=1, fsent="<foreign-sentence>", byline="", reflines=[]): nodememo = {} # to keep track of sizes (# of nodes, # of edges) # forest id, foreign sentence (TODO: refs) fsent = fsent.split(" ") s = "%s\t%s\n" % (sid, " ".join(fsent)) + \ "%d\n" % len(reflines) + \ "".join(reflines) flen = len(words_to_chars(fsent, encode_back=True)) fwlen = len(fsent) reversed_fsent = list(reversed(fsent)) ## RIGHT TO LEFT if byline != "": self.traverse(0, 0, reversed_fsent, rules, nodememo) ## swap back self.adjust_spans(flen, fwlen) byline = byline.split(" ") byline_flen = self.i byline_fwlen = self.wi byline_f = fsent[:byline_fwlen] print >> logs, "clen (non-byline) = %d (%d)" % (flen, self.j - self.i) print >> logs, "wlen (non-byline) = %d (%d)" % (fwlen, self.wj - self.wi) print >> logs, "BYLINE = " + " ".join(byline_f) + \ " ### %d chars, %d words" % (byline_flen, byline_fwlen) assert len(words_to_chars(byline_f)) == byline_flen, "@sentence %d, BYLINE Error" % opts.sentid ## check consistency ## new rule/edge ## TOP("by" "line" x0:TOP) -> "BY" "LINE" x0 ### id=-1 byline_e = " ".join('"%s"' % w for w in byline) lhs = "TOP(" + byline_e + " x0:%s)" % self.x # "TOP" rhs = " ".join('"%s"' % w for w in byline_f) + " x0" # byline rule, id=-1 rid = -1 rules[rid] = "%s -> %s ### id=%d" % (lhs, rhs, rid) ## make david-style LHS david_lhs = [] for w in byline: david_lhs.append(sym.fromstring(w)) david_lhs.append(sym.setindex(dummylabel, 1)) ded = Deduction([self], rule.Rule(rid, rule.Phrase(david_lhs), rule.Phrase(david_lhs)),\ svector.Vector()) ded.lhsstr = byline_e.split() + [self] ## N.B.: dont forget "..." 
ded.ruleid = rid # new node on top of TOP oldtop = self self = Item(self.x, 0, flen, deds=[ded]) self.x = oldtop.x self.wi = 0 self.wj = fwlen self.id = len(nodememo)+1 nodememo[id(self)] = (self.id, nodememo[id(oldtop)][1]+1) #edges else: # establish node spans self.traverse(0, 0, reversed_fsent, rules, nodememo) # swap i,j self.adjust_spans(flen, fwlen) ## lhuang: the following is from hope.py ## be very careful about weights interpolation sg = sgml.Sentence(fsent) sg.fwords = fsent sg.refs = [refline.split(" ") for refline in reflines] if sg.refs: theoracle.input(sg, verbose=False) # 1-best self.reweight(weights) output(self, "1-best @ %s" % sid, onebestbleus, onebestscores) base_oracleweights = theoracle.make_weights(additive=True) # we use the in-place operations because oracleweights might be # a subclass of Vector for relative in []:#[opts.hope]: oracleweights = theoracle.make_weights(additive=True) oracleweights *= relative # interpolation: taking modelcost into account oracleweights += weights # compute oracle self.rescore(theoracle.models, oracleweights, add=True) # TODO: why?? output(self, "hope%s " % relative, hopebleus[relative], hopescores[relative]) # right boundary should match sentence length (in chars) assert self.j == flen and self.wj == fwlen, \ "@sentence %d, Boundary Mismatch at %s\t%s" % (opts.sentid, sid, fsent) + \ "self.j=%d, flen=%d; self.wj=%d, fwlen=%d" % (self.j, flen, self.wj, fwlen) s += "%d\t%d\n" % nodememo[id(self)] + \ self._dump(rules, deriv=self.viterbi_deriv()) return s
sub_j2 -= j1 # Check English holes # can't lie outside phrase if sub_j1 < 0 or sub_j2 > elen: return None # can't overlap for j in xrange(sub_j1,sub_j2): if type(ewords[j]) is tuple or ewords[j] is None: return None # Set first eword to var, rest to None # We'll clean up the Nones later v = sym.setindex(sub_x, index) fwords[i] = v fpos[i] = (sub_i1,sub_i2) ewords[sub_j1] = (v, sub_j1+j1, sub_j2+j1) for j in xrange(sub_j1+1,sub_j2): ewords[j] = None index += 1 # Require an aligned French word if opts.require_aligned_terminal and not flag: return None epos = [] new_ewords = [] for i in xrange(elen): if ewords[i] is not None:
def forest_from_text_helper(tokiter, memo, want_item=False, delete_words=[]):
    """Generate forest objects from a token stream until a 'pop' token.

    Currently this assumes that the only frontier nodes in the tree are
    words.

    Tokens are tuples whose first element names the type: "or",
    "nonterm", "term", "ref" or "pop".  Objects built for tokens carrying
    a non-empty node id are recorded in ``memo`` so that later "ref"
    tokens can yield them again.  With ``want_item`` set, each "nonterm"
    deduction is wrapped in an Item.  Terminals found in ``delete_words``
    are skipped.  Raises TreeFormatException when the stream runs out or
    a token type is not recognised.
    """
    while True:
        try:
            token = tokiter.next()
            kind = token[0]
        except StopIteration:
            raise TreeFormatException("incomplete tree")

        # End of the current nesting level.
        if kind == 'pop':
            return

        # Back-reference to something built earlier.
        if kind == 'ref':
            yield memo[token[1]]
            continue

        # Plain word, unless it is on the deletion list.
        if kind == 'term':
            word = token[1]
            if word not in delete_words:
                yield sym.fromstring(word)
            continue

        # OR node: gather every deduction up to the matching pop.
        if kind == "or":
            _, nodeid = token
            alternatives = list(
                forest_from_text_helper(tokiter, memo,
                                        delete_words=delete_words))
            item = Item(dummylabel, dummyi, dummyj, deds=alternatives)
            if nodeid:
                memo[nodeid] = item
            yield item
            continue

        if kind != "nonterm":
            raise TreeFormatException("unknown token %s" % (token, ))

        # Deduction: label, feature costs, then the children.
        _, nodeid, ruleid, dcoststr = token
        if ruleid == "":
            label = dummylabel
        else:
            label = sym.fromtag(ruleid)

        costs = svector.Vector()
        if dcoststr:
            for pair in dcoststr.split(','):
                name, value = pair.split(':', 1)
                costs[name] = float(value)

        ants = []
        rhs = []
        children = forest_from_text_helper(tokiter, memo, want_item=True,
                                           delete_words=delete_words)
        for kid in children:
            if not isinstance(kid, Item):
                rhs.append(kid)
                continue
            # The n-th Item child becomes the variable with index n.
            ants.append(kid)
            rhs.append(sym.setindex(dummylabel, len(ants)))

        built = Deduction(ants=ants,
                          rule=rule.Rule(label, rule.Phrase(rhs), rule.Phrase(rhs)),
                          dcost=costs)
        if want_item:
            # need to insert OR node
            built = Item(dummylabel, dummyi, dummyj, deds=[built])
        if nodeid:
            memo[nodeid] = built
        yield built