def seed(self, input, grammars, models, weights):
    """Seed the chart from a plain input sentence and the given grammars."""
    fwords = [sym.fromstring(f) for f in input.fwords]
    self.models = models
    self.weights = weights

    # Seed the dotchart.  This will give the extracted rules.
    self.grammars = [(g, DotChart(self, fwords))
                     for g in grammars if isinstance(g, Grammar)]
    for (g, dots) in self.grammars:
        for start in xrange(self.n):
            if g.filterspan(start, start, self.n):
                dots.add(g.root, start, start, ())
                self.dot_added += 1

    # Grammars that synthesize rules directly from the input.
    for g in grammars:
        if isinstance(g, NewGrammar):
            g.input(input)
            for i in xrange(self.n):
                for j in xrange(i + 1, self.n + 1):
                    for (r,) in g.get_rules(i, j):
                        estimate_rule(r, models, weights)
                        self.add_axiom(i, j, r)

    # Last resort for unknown French word: pass it through.
    for i in xrange(0, len(fwords)):
        for x in self.default_nonterminals:
            passthrough = rule.Rule(x,
                                    rule.Phrase(fwords[i:i+1]),
                                    rule.Phrase(fwords[i:i+1]),
                                    scores=svector.Vector('unknown', 1.))
            estimate_rule(passthrough, models, weights)
            self.add_axiom(i, i+1, passthrough)
def seed(self, flattice, grammars, models, weights):
    """Seed the chart from an input lattice and the given grammars."""
    self.models = models
    self.weights = weights

    # Seed the dotchart.  This will give the extracted rules.
    self.grammars = [(g, DotChart(self, flattice))
                     for g in grammars if isinstance(g, Grammar)]
    for (g, dots) in self.grammars:
        for node in xrange(self.flattice.n - 1):
            if g.filterspan(self.flattice, node, node):
                dots.add(g.root, node, node, ())
                self.dot_added += 1

    # Grammars that synthesize rules directly from the input.
    for g in grammars:
        if isinstance(g, NewGrammar):
            g.input(flattice)
            for i in xrange(self.flattice.n - 1):
                for j in xrange(i + 1, self.flattice.n):
                    for (r,) in g.get_rules(i, j):
                        estimate_rule(r, models, weights)
                        self.add_axiom(i, j, r)

    # Last resort for unknown French word: pass it through.
    for edge in flattice.edges:
        for x in self.default_nonterminals:
            passthrough = rule.Rule(x,
                                    rule.Phrase([edge.w]),
                                    rule.Phrase([edge.w]),
                                    scores=svector.Vector('unknown', 1.))
            estimate_rule(passthrough, models, weights)
            self.add_axiom(edge.i, edge.j, passthrough)
def input(self, input):
    """Collect pass-through rules declared as SGML markup on the input.

    Each marked span may carry english translations plus optional
    cost/prob, feature-name, and label attributes, all '|'-separated.
    """
    self.rules = collections.defaultdict(list)
    for tag, attrs, i, j in input.fmeta:
        attrs = sgml.attrs_to_dict(attrs)
        if 'english' not in attrs:
            continue
        ephrases = attrs['english'].split('|')

        # Costs: explicit costs, or negative-log10 probabilities, or uniform.
        if 'cost' in attrs:
            costs = [float(x) for x in attrs['cost'].split('|')]
        elif 'prob' in attrs:
            costs = [-math.log10(float(x)) for x in attrs['prob'].split('|')]
        else:
            costs = [-math.log10(1.0 / len(ephrases))] * len(ephrases)  # uniform
        if len(costs) != len(ephrases):
            sys.stderr.write("wrong number of probabilities/costs")
            raise ValueError

        # Feature names: one per phrase, a shared one, or the default.
        if 'features' in attrs:
            features = attrs['features'].split('|')
            if len(features) != len(ephrases):
                sys.stderr.write("wrong number of feature names")
                raise ValueError
        elif 'feature' in attrs:
            features = [attrs['feature']] * len(ephrases)
        else:
            features = ['sgml'] * len(ephrases)

        if 'label' in attrs:
            tags = attrs['label'].split('|')
        else:
            tags = [tag.upper()]

        # bug: if new nonterminals are introduced at this point,
        # they will not participate in the topological sort
        for (ephrase, cost, feature) in zip(ephrases, costs, features):
            for t in tags:
                r = rule.Rule(sym.fromtag(t),
                              rule.Phrase(input.fwords[i:j]),
                              rule.Phrase([sym.fromstring(e) for e in ephrase.split()]),
                              scores=svector.Vector('%s' % feature, cost))
                self.rules[i, j].append((r,))
def input(self, lat):
    """Load rules supplied directly on an input lattice's spans."""
    self.rules = collections.defaultdict(list)
    for span in lat.spans:
        i, j = span.i, span.j
        # Optional per-span score vector; zero vector when absent.
        v = svector.Vector(span.v) if hasattr(span, 'v') else model.zero
        # bug: if new nonterminals are introduced at this point,
        # they will not participate in the topological sort
        r = rule.Rule(sym.fromtag(span.x),
                      rule.Phrase([sym.fromstring(f) for f in span.f]),
                      rule.Phrase([sym.fromstring(e) for e in span.e]),
                      scores=v)
        self.rules[i, j].append((r,))
        if log.level >= 2:
            log.write("added lattice rule at (%d,%d): %s\n" % (i, j, r))
def make_forest(fieldss):
    # Build a forest.Item hypergraph from decoder trace records (one dict of
    # fields per hypothesis).  Returns a goal Item whose deductions point at
    # every hypothesis whose 'forward' score marks it as final.
    nodes = {}
    goal_ids = set()
    for fields in fieldss:
        node_id = fields['hyp']
        if node_id not in nodes:
            nodes[node_id] = forest.Item(sym.fromtag('PHRASE'), 0, 0, [])
        node = nodes[node_id]
        if node_id == 0:
            # Hypothesis 0 is the empty initial hypothesis: an epsilon rule
            # with no antecedents and zero cost.
            r = rule.Rule(sym.fromtag('PHRASE'), rule.Phrase([]), rule.Phrase([]))
            node.deds.append(forest.Deduction((), r, svector.Vector()))
        else:
            m = scores_re.match(fields['scores'])
            core_values = [float(x) for x in m.group(1).split(',')]
            dcost = svector.Vector(m.group(2).encode('utf8'))
            # Core model scores are folded in under reserved feature names.
            for i, x in enumerate(core_values):
                dcost["_core%d" % i] = x
            back = int(fields['back'])
            # NOTE(review): assumes the back-pointer's record was already
            # seen (nodes[back] exists) — confirm trace ordering guarantees.
            ant = nodes[back]
            f = fields['src-phrase'].encode('utf8').split()
            e = fields['tgt-phrase'].encode('utf8').split()
            if len(f) != int(fields['cover-end']) - int(fields['cover-start']) + 1:
                sys.stderr.write("warning: French phrase length didn't match covered length\n")
            # Prepend the variable standing for the antecedent hypothesis.
            # NOTE(review): f/e words are plain strings here, not converted
            # with sym.fromstring as elsewhere — presumably rule.Phrase
            # accepts either; verify.
            f = rule.Phrase([sym.setindex(sym.fromtag('PHRASE'), 1)] + f)
            e = rule.Phrase([sym.setindex(sym.fromtag('PHRASE'), 1)] + e)
            r = rule.Rule(sym.fromtag('PHRASE'), f, e)
            ded = forest.Deduction((ant,), r, dcost)
            node.deds.append(ded)
        if int(fields['forward']) < 0: # goal
            goal_ids.add(node_id)
    # Single goal item dominating all final hypotheses.
    goal = forest.Item(None, 0, 0, [])
    for node_id in goal_ids:
        goal.deds.append(forest.Deduction((nodes[node_id],), None, svector.Vector()))
    return goal
def output(f):
    # Render the forest's Viterbi derivation as a one-line summary:
    # hypothesis words, the derivation itself, and its feature vector.
    deriv = f.viterbi_deriv()
    hypv = deriv.vector()
    hyp = deriv.english()
    return "hyp={{{%s}}} derivation={{{%s}}} %s" % (" ".join(sym.tostring(e) for e in hyp), deriv, hypv)

# Driver loop: one source line, one forest, and one reference line per
# reference file, iterated in lockstep.
# NOTE(review): srcfile/forestfile/reffiles/weights/theoracle are defined
# outside this excerpt; the loop body may continue beyond it.
for srcline, forestline, reflines in itertools.izip(srcfile, forestfile, itertools.izip(*reffiles)):
    f = forest.forest_from_text(forestline)
    # the oracle needs to know how long all the French spans are
    for item in f.bottomup():
        for ded in item.deds:
            # replace rule's French side with correct number of French words
            # we don't even bother to use the right number of variables
            ded.rule = rule.Rule(ded.rule.lhs,
                                 rule.Phrase([sym.fromstring('<foreign-word>')]*int(ded.dcost['foreign-length'])),
                                 ded.rule.e)
    f.reweight(weights)
    print "1-best %s" % output(f)
    s = sgml.Sentence(srcline.split())
    s.fwords = srcline.split()
    s.refs = [refline.split() for refline in reflines]
    theoracle.input(s, verbose=False)
    oracleweights = theoracle.make_weights(additive=True)
    # we use the in-place operations because oracleweights might be
    # a subclass of Vector
    oracleweights *= -1
    oracleweights += weights
# NOTE(review): fragment of a larger rule-extraction function — `elen`,
# `ewords`, `new_ewords`, `epos`, `fwords`, `fpos`, `x`, `i1`, `i2`, `j1`,
# `j2`, `a`, `force_english_prefix`, and `prefix_labels` are all defined
# upstream of this excerpt.
for i in xrange(elen):
    if ewords[i] is not None:
        if type(ewords[i]) is tuple:
            # A tuple entry is a nonterminal: (symbol, english span).
            (v, ei, ej) = ewords[i]
            # force slash categories to be at left edge of English side
            if force_english_prefix and len(new_ewords) != 0 and sym.clearindex(v) in prefix_labels:
                return None
            new_ewords.append(v)
            epos.append((ei, ej))
        else:
            # Plain word: record its absolute English position.
            new_ewords.append(ewords[i])
            epos.append(i + j1)
r = XRule(x, rule.Phrase(tuple(fwords)), rule.Phrase(tuple(new_ewords)))
r.fpos = fpos
r.epos = epos
r.span = (i1, i2, j1, j2)
if opts.keep_word_alignments:
    # Word alignments only connect terminal positions (ints); tuple entries
    # in fpos/epos are nonterminal spans and are skipped.
    r.word_alignments = []
    for fi in xrange(len(fpos)):
        if type(fpos[fi]) is int:
            for ei in xrange(len(epos)):
                if type(epos[ei]) is int:
                    if a.aligned[fpos[fi]][epos[ei]]:
                        r.word_alignments.append((fi, ei))
return r
def forest_from_text_helper(tokiter, memo, want_item=False, delete_words=()):
    """Parse a token stream into forest nodes.

    tokiter: iterator over (toktype, ...) token tuples.
    memo: dict mapping node ids to already-built nodes, used by 'ref' tokens.
    want_item: if True, wrap each yielded Deduction in an OR (Item) node.
    delete_words: terminals to drop from the tree.

    Yields Items, Deductions, or terminal words; returns on a 'pop' token.
    Raises TreeFormatException on truncated input or unknown tokens.
    Currently this assumes that the only frontier nodes in the tree are
    words.
    """
    # bugfix: delete_words was a mutable default argument ([]); it is only
    # read here, but an immutable tuple default is the safe idiom.
    while True:
        try:
            tok = tokiter.next()
            toktype = tok[0]
        except StopIteration:
            raise TreeFormatException("incomplete tree")
        if toktype == "or":
            _, nodeid = tok
            deds = list(forest_from_text_helper(tokiter, memo,
                                                delete_words=delete_words))
            node = Item(dummylabel, dummyi, dummyj, deds=deds)
            if nodeid:
                memo[nodeid] = node
            yield node
        elif toktype == "nonterm":
            _, nodeid, ruleid, dcoststr = tok
            if ruleid == "":
                ruleid = dummylabel
            else:
                ruleid = rule.Nonterminal(ruleid)
            dcost = svector.Vector()
            if dcoststr:
                for fv in dcoststr.split(','):
                    f, v = fv.split(':', 1)
                    dcost[f] = float(v)
            # Children: Items become numbered variables on the rhs;
            # everything else (terminals) is kept inline.
            ants = []
            rhs = []
            vi = 1
            for child in forest_from_text_helper(tokiter, memo,
                                                 want_item=True,
                                                 delete_words=delete_words):
                if isinstance(child, Item):
                    ants.append(child)
                    rhs.append(dummylabel.setindex(vi))
                    vi += 1
                else:
                    rhs.append(child)
            # The same rhs is used for both sides of the rule.
            r = rule.Rule(ruleid, rule.Phrase(rhs), rule.Phrase(rhs))
            node = Deduction(ants=ants, rule=r, dcost=dcost)
            if want_item:
                # need to insert OR node
                node = Item(dummylabel, dummyi, dummyj, deds=[node])
            if nodeid:
                memo[nodeid] = node
            yield node
        elif toktype == 'term':
            terminal = tok[1]
            if terminal not in delete_words:
                yield terminal
        elif toktype == 'ref':
            yield memo[tok[1]]
        elif toktype == 'pop':
            return
        else:
            raise TreeFormatException("unknown token %s" % (tok, ))
def translate(self, input):
    """input: any object that has an attribute 'words' which is a list of
    numberized French words, an 'id' attribute, and an 'instruction'
    attribute.
    output: a forest"""
    # Periodically restart the decoder process (presumably to bound its
    # memory growth).
    if self.decoder_age >= 100:
        self.start_decoder()
    restarts = 0
    self.decoder_age += 1
    outforest = ""
    # bugfix: lastexcept was previously unbound when every attempt failed
    # via the `continue` path (no exception raised), making the give-up
    # log.write below crash with NameError.
    lastexcept = None
    while restarts <= 3:
        try:
            self.send_weights(input=input)
            outforest = self.instruct(input)
            if outforest == "" or not self.create_forest(outforest) or self.child.poll() is not None:
                # graehl->pust: careful - restarts += 1 doesn't happen on
                # continue. infinite loop possible if decoder really outputs
                # no forest (I think you changed it so a dummy forest is
                # output, so this may be what you want? just bad for error
                # reporting if you hang forever)
                continue
            else:
                break
        # bugfix: was a bare `except:`, which also swallowed SystemExit
        # and KeyboardInterrupt.
        except Exception:
            lastexcept = log.strexcept(True)
            log.writeln("CAUGHT exception: %s" % lastexcept)
        restarts += 1
        if restarts <= 3:
            log.writeln("restarting decoder")
            self.start_decoder()
        else:
            self.start_decoder()
            # don't raise here because of the global 100-retries limit in
            # trainer.py; fall back to a dummy no-parse forest instead.
            log.write("too many decoder restarts, giving up on exception %s:\n%s\nwith weights:\n%s\n" % (lastexcept, repr(input), self.weights))
            self.create_forest("(0<noparse:1> )")
    log.writeln("received forest: %s...%s for %s" % (outforest[:80], outforest[-80:], input))
    # create_forest stashed the parsed forest on self; claim and clear it.
    f = self.forest
    self.forest = None
    for item in f.bottomup():
        for ded in item.deds:
            # replace rule's French side with correct number of French words
            # we don't even bother to use the right number of variables
            ded.rule = rule.Rule(
                ded.rule.lhs,
                rule.Phrase([sym.fromstring('<foreign-word>')] *
                            int(ded.dcost['foreign-length'])),
                ded.rule.e)
            for feature in delete_features:
                del ded.dcost[feature]
    f.reweight(self.weights)  # because forest_from_text doesn't compute viterbi
    return f