def reader(file, transpose=False):
    while True:
        # each record is three lines: comment, English words, and French
        # words annotated with their English alignment indices
        try:
            comment = file.next().rstrip()
            ewords = [sym.fromstring(word) for word in file.next().split()]
            fline = file.next()
            fxwords = Alignment.eline_re.findall(fline)
        except StopIteration:
            return
        # drop a leading NULL token if present
        (fword, eindices) = fxwords[0]
        if fword == "NULL":
            fxwords = fxwords[1:]
        fxwords = [(sym.fromstring(fword), eindices) for (fword, eindices) in fxwords]
        fwords = [fword for (fword, eindices) in fxwords]
        if not transpose:
            a = Alignment(fwords, ewords, comment)
        else:
            a = Alignment(ewords, fwords, comment)
        for i in xrange(len(fxwords)):
            (fword, eindices) = fxwords[i]
            for eindex in eindices.split():
                j = int(eindex) - 1  # alignment indices are 1-based
                if not transpose:
                    a.align(i, j)
                else:
                    a.align(j, i)
        yield a
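# Usage sketch (an assumption, not part of the original source): reader()
# consumes GIZA++-style alignment output in which each sentence pair occupies
# three lines (comment, English sentence, French sentence with per-word
# English indices) and yields one Alignment per pair.
#
#     f = open("aligned.A3.final")
#     for a in reader(f):
#         process(a)    # `process` is a placeholder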
def reader_pharaoh(ffile, efile, afile):
    progress = 0
    for (fline, eline, aline) in itertools.izip(ffile, efile, afile):
        progress += 1
        fwords = [sym.fromstring(w) for w in fline.split()]
        ewords = [sym.fromstring(w) for w in eline.split()]
        a = Alignment(fwords, ewords)
        for (i, j) in Alignment.aline_re.findall(aline):
            i = int(i)
            j = int(j)
            if i >= len(fwords) or j >= len(ewords):
                sys.stderr.write("warning, line %d: alignment point (%s,%s) out of bounds (%s,%s)\n"
                                 % (progress, i, j, len(fwords), len(ewords)))
                continue
            a.align(i, j)
        yield a
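# Usage sketch (assumption): reader_pharaoh() reads three parallel files in
# Pharaoh/Moses format; each alignment line holds pairs like "0-0 1-2 2-1",
# where "i-j" links French word i to English word j (both 0-based).
#
#     for a in reader_pharaoh(open("train.f"), open("train.e"), open("train.a")):
#         process(a)    # `process` is a placeholder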
def seed(self, input, grammars, models, weights):
    fwords = [sym.fromstring(f) for f in input.fwords]
    self.models = models
    self.weights = weights

    # Seed the dotchart. This will give the extracted rules
    self.grammars = [(g, DotChart(self, fwords)) for g in grammars if isinstance(g, Grammar)]
    for (g, dotchart) in self.grammars:
        for i in xrange(self.n):
            if g.filterspan(i, i, self.n):
                dotchart.add(g.root, i, i, ())
                self.dot_added += 1

    for g in grammars:
        if isinstance(g, NewGrammar):
            g.input(input)
            for i in xrange(self.n):
                for j in xrange(i + 1, self.n + 1):
                    for (r,) in g.get_rules(i, j):
                        estimate_rule(r, models, weights)
                        self.add_axiom(i, j, r)

    # Last resort for unknown French word: pass it through
    for i in xrange(0, len(fwords)):
        for x in self.default_nonterminals:
            r = rule.Rule(x,
                          rule.Phrase(fwords[i:i+1]),
                          rule.Phrase(fwords[i:i+1]),
                          scores=svector.Vector('unknown', 1.))
            estimate_rule(r, models, weights)
            self.add_axiom(i, i+1, r)
def _tree_helper(t, antvalues):
    t = tree.str_to_tree(t)
    for node in t.frontier():
        x = sym.fromstring(node.label)
        if sym.isvar(x):
            node.insert_child(0, antvalues[sym.getindex(x) - 1])
    return t
def __init__(self, x, i, j, deds=None, states=None, viterbi=None):
    if type(x) is str:
        x = sym.fromstring(x)
    self.x = x
    self.i = i
    self.j = j
    self.deds = deds if deds is not None else []
    self.states = states
    self.viterbi = viterbi
def __init__(self, file):
    self.parses = {}
    for line in file:
        t = tree.str_to_tree(line)
        if t is None:
            log.write("warning: couldn't read tree\n")
            continue
        s = tuple([sym.fromstring(node.label) for node in t.frontier()])
        self.parses[s] = t
def prepare_lattice(lat):
    if hasattr(lat, "prepared") and lat.prepared:
        return lat
    lat.prepared = True

    # create index by j
    lat.edge_index = collections.defaultdict(list)
    for edge in lat.edges:
        lat.edge_index[edge.j].append(edge)

    # numberize all the words
    for edge in lat.edges:
        edge.w = sym.fromstring(edge.w)
    lat.words = [sym.fromstring(w) for w in lat.words]

    # cache all-pairs shortest paths
    lat.distance = lat.compute_distance()

    return lat
def input(self, sent):
    self.refngrams = collections.defaultdict(int)
    for ref in sent.refs:
        ewords = [sym.fromstring(e) for e in ref]
        ngrams = collections.defaultdict(int)
        for o in xrange(self.order):
            for i in xrange(len(ewords) - o):
                ngrams[tuple(ewords[i:i+o+1])] += 1
        # clip each n-gram count at its maximum over all references
        for ngram in ngrams:
            self.refngrams[ngram] = max(self.refngrams[ngram], ngrams[ngram])
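# Illustration (not from the original source) of the clipping above: each
# reference n-gram count is the maximum over all references, as in BLEU.
# With refs ["the cat", "the the"], the unigram ("the",) gets count
# max(1, 2) = 2, while ("cat",) gets count 1.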
def input(self, lat):
    self.rules = collections.defaultdict(list)
    for span in lat.spans:
        i, j = span.i, span.j
        if hasattr(span, 'v'):
            v = svector.Vector(span.v)
        else:
            v = model.zero
        # bug: if new nonterminals are introduced at this point,
        # they will not participate in the topological sort
        r = rule.Rule(sym.fromtag(span.x),
                      rule.Phrase([sym.fromstring(f) for f in span.f]),
                      rule.Phrase([sym.fromstring(e) for e in span.e]),
                      scores=v)
        self.rules[i,j].append((r,))
        if log.level >= 2:
            log.write("added lattice rule at (%d,%d): %s\n" % (i, j, r))
def read_weightfile(f, threshold=None):
    w = {}
    progress = 0
    log.write("Reading ttable from %s..." % f.name)
    for line in f:
        progress += 1
        if progress % 100000 == 0:
            log.write(".")
        (word1, word2, p) = line.split()
        p = float(p)
        if threshold is not None and p < threshold:
            continue
        if word1 == "NULL":
            word1 = None
        else:
            word1 = sym.fromstring(word1)
        if word2 == "NULL":
            word2 = None
        else:
            word2 = sym.fromstring(word2)
        w.setdefault(word1, {}).setdefault(word2, p)
    log.write("done\n")
    return w
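# Usage sketch (assumption): the ttable file holds one "word1 word2 prob"
# triple per line, with "NULL" standing for the empty word, e.g.:
#
#     maison house 0.41
#     NULL house 0.02
#
#     w = read_weightfile(open("lex.f2e"), threshold=0.001)  # example values
#     p = w.get(sym.fromstring("maison"), {}).get(sym.fromstring("house"))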
def input(self, fwords, meta):
    self.ewords = [{} for f in fwords]
    for (tag, attrs, i, j) in meta:
        attrs = sgml.attrs_to_dict(attrs)
        if attrs.has_key('eword'):
            if j - i != 1:
                log.write("warning: eword attribute given for multi-word French expression\n")
            ewords = [sym.fromstring(e.strip()) for e in attrs['eword'].split('|')]
            if 'cost' in attrs:
                costs = [float(x) for x in attrs['cost'].split('|')]
            elif 'prob' in attrs:
                costs = [-math.log10(float(x)) for x in attrs['prob'].split('|')]
            else:
                costs = [-math.log10(1.0 / len(ewords)) for e in ewords]
            self.ewords[i] = dict(zip(ewords, costs))
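# Illustration (hypothetical markup, not from the original source): a span
# annotated with candidate English words and probabilities might look like
#
#     <ne eword="Paris|paris" prob="0.9|0.1">paris</ne>
#
# which would fill self.ewords[i] with the two alternatives and their
# negative log10 costs.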
def __init__(self, sent):
    object.__init__(self)
    if len(sent.words) == 0:
        self.columns = ()
        return  # empty line, it happens
    line = sent.words[0]
    if line.startswith("((("):
        if len(sent.words) > 1:
            log.write("Bad sentence: %s\n" % (line))
        assert len(sent.words) == 1  # make sure there are no spaces in your confusion nets!
        line = "((('<s>',1.0,1),)," + line[1:len(line)-1] + "(('</s>',1.0,1),))"
        cols = eval(line)
        res = []
        for col in cols:
            x = []
            for alt in col:
                costs = alt[1]
                if type(costs) != type((1, 2)):
                    costs = (float(costs),)
                j = []
                for c in costs:
                    j.append(float(c))
                cost = tuple(j)
                spanlen = 1
                if len(alt) == 3:
                    spanlen = alt[2]
                x.append((sym.fromstring(alt[0], terminal=True), None, spanlen))
            res.append(tuple(x))
        self.columns = tuple(res)
    else:
        # convert a string of input into a CN
        res = []
        res.append(((sym.fromstring('<s>', terminal=True), None, 1),))
        for word in sent.words:
            res.append(((sym.fromstring(word, terminal=True), None, 1),))  # (alt=word, cost=0.0)
        res.append(((sym.fromstring('</s>', terminal=True), None, 1),))
        self.columns = tuple(res)
def input(self, input):
    self.rules = collections.defaultdict(list)
    for tag, attrs, i, j in input.fmeta:
        attrs = sgml.attrs_to_dict(attrs)
        if attrs.has_key('english'):
            ephrases = attrs['english'].split('|')
            if attrs.has_key('cost'):
                costs = [float(x) for x in attrs['cost'].split('|')]
            elif attrs.has_key('prob'):
                costs = [-math.log10(float(x)) for x in attrs['prob'].split('|')]
            else:
                costs = [-math.log10(1.0 / len(ephrases)) for e in ephrases]  # uniform
            if len(costs) != len(ephrases):
                sys.stderr.write("wrong number of probabilities/costs\n")
                raise ValueError
            if attrs.has_key('features'):
                features = attrs['features'].split('|')
                if len(features) != len(ephrases):
                    sys.stderr.write("wrong number of feature names\n")
                    raise ValueError
            elif attrs.has_key('feature'):
                features = [attrs['feature'] for ephrase in ephrases]
            else:
                features = ['sgml' for ephrase in ephrases]
            if attrs.has_key('label'):
                tags = attrs['label'].split('|')
            else:
                tags = [tag.upper()]
            # bug: if new nonterminals are introduced at this point,
            # they will not participate in the topological sort
            for (ephrase, cost, feature) in zip(ephrases, costs, features):
                for tag in tags:
                    r = rule.Rule(sym.fromtag(tag),
                                  rule.Phrase(input.fwords[i:j]),
                                  rule.Phrase([sym.fromstring(e) for e in ephrase.split()]),
                                  scores=svector.Vector('%s' % feature, cost))
                    self.rules[i,j].append((r,))
def finish(self, v, words):
    """Return a copy of v that contains only the features relevant to
    computing a score. We can also perform any necessary corrections to
    v that are possible knowing the whole output."""

    # Actually, for BLEU we just recompute from scratch

    # postprocessing: delete non-ASCII chars and @UNKNOWN@
    words = [sym.tostring(w) for w in words]
    words = " ".join(words)
    words = "".join(c for c in words if ord(c) < 128)
    words = [sym.fromstring(word) for word in words.split()]

    v = svector.Vector()

    cand = collections.defaultdict(int)
    for o in xrange(self.order):
        for i in xrange(len(words) - o):
            cand[tuple(words[i:i+o+1])] += 1

    match = collections.defaultdict(int)
    for ngram in cand:
        match[len(ngram) - 1] += min(cand[ngram], self.oraclemodel.refngrams[ngram])

    for o in xrange(self.order):
        v["oracle.match%d" % o] = match[o]
        v["oracle.guess%d" % o] = max(0, len(words) - o)

    v["oracle.srclen"] = self.wordcounter.srclen
    v["oracle.candlen"] = len(words)

    if self.variant == "ibm":
        v["oracle.reflen"] = min((abs(l - len(words)), l) for l in self.wordcounter.reflens)[1]
    else:
        v["oracle.reflen"] = self.wordcounter.reflen

    return v
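# Sketch (an assumption, not part of the original source): the oracle.*
# features computed above are sufficient statistics for BLEU, so a score
# could be recovered roughly as follows (ignoring zero-match smoothing):
#
#     import math
#     def bleu_from_features(v, order=4):
#         logprec = sum(math.log(float(v["oracle.match%d" % o]) /
#                                v["oracle.guess%d" % o])
#                       for o in xrange(order)) / order
#         # brevity penalty: exp(1 - r/c) when the candidate is too short
#         logbp = min(0.0, 1.0 - float(v["oracle.reflen"]) / v["oracle.candlen"])
#         return math.exp(logprec + logbp)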
def output(f):
    deriv = f.viterbi_deriv()
    hypv = deriv.vector()
    hyp = deriv.english()
    return "hyp={{{%s}}} derivation={{{%s}}} %s" % (" ".join(sym.tostring(e) for e in hyp), deriv, hypv)

for srcline, forestline, reflines in itertools.izip(srcfile, forestfile, itertools.izip(*reffiles)):
    f = forest.forest_from_text(forestline)

    # the oracle needs to know how long all the French spans are
    for item in f.bottomup():
        for ded in item.deds:
            # replace rule's French side with correct number of French words
            # we don't even bother to use the right number of variables
            ded.rule = rule.Rule(ded.rule.lhs,
                                 rule.Phrase([sym.fromstring('<foreign-word>')]
                                             * int(ded.dcost['foreign-length'])),
                                 ded.rule.e)

    f.reweight(weights)
    print "1-best %s" % output(f)

    s = sgml.Sentence(srcline.split())
    s.fwords = srcline.split()
    s.refs = [refline.split() for refline in reflines]

    theoracle.input(s, verbose=False)

    oracleweights = theoracle.make_weights(additive=True)
    # we use the in-place operations because oracleweights might be
    # a subclass of Vector
    oracleweights *= -1
    oracleweights += weights
- change notion of "tight" so that outer phrases are minimal and inner
  phrases are maximal?
"""

import sys, os, os.path
import monitor
import time, math
import random
import alignment, rule, forest
import sym
import log
import cPickle

log.level = 1

PHRASE = sym.fromstring('[PHRASE]')
START = sym.fromstring('[START]')
nonterminals = [PHRASE]

class XRule(rule.Rule):
    def __init__(self, lhs, f, e, owner=None, scores=None):
        rule.Rule.__init__(self, lhs, f, e, owner=owner, scores=scores)
        self.fpos = self.epos = self.span = None

class Feature(object):
    def __init__(self):
        object.__init__(self)

    def process_alignment(self, a):
        pass
if opts.weightfiles is not None:
    fweightfile = file(opts.weightfiles[0], "w")
    eweightfile = file(opts.weightfiles[1], "w")

if opts.ratiofile is not None:
    ratiofile = file(opts.ratiofile, "w")

fcount = {}
ecount = {}
fecount = {}
count = 0

progress = 0
for a in alignments:
    null = sym.fromstring("NULL")

    # Calculate lexical weights
    for i in xrange(len(a.fwords)):
        for j in xrange(len(a.ewords)):
            if a.aligned[i][j]:
                count += 1
                fcount[a.fwords[i]] = fcount.get(a.fwords[i], 0) + 1
                ecount[a.ewords[j]] = ecount.get(a.ewords[j], 0) + 1
                fecount[(a.fwords[i], a.ewords[j])] = fecount.get((a.fwords[i], a.ewords[j]), 0) + 1
    for i in xrange(len(a.fwords)):
        if not a.faligned[i]:
            count += 1
            fcount[a.fwords[i]] = fcount.get(a.fwords[i], 0) + 1
            ecount[null] = ecount.get(null, 0) + 1
            fecount[(a.fwords[i], null)] = fecount.get((a.fwords[i], null), 0) + 1
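# Sketch (assumption): once the counts are collected, the lexical weight
# files would hold the relative frequencies P(e|f) and P(f|e), e.g.:
#
#     for (f, e), c in fecount.iteritems():
#         fweightfile.write("%s %s %f\n" %
#                           (sym.tostring(f), sym.tostring(e), float(c) / fcount[f]))
#         eweightfile.write("%s %s %f\n" %
#                           (sym.tostring(e), sym.tostring(f), float(c) / ecount[e]))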
def dump(self, rules=None, sid=1, fsent="<foreign-sentence>", byline="", reflines=[]):
    nodememo = {}  # to keep track of sizes (# of nodes, # of edges)
    # forest id, foreign sentence (TODO: refs)
    fsent = fsent.split(" ")
    s = "%s\t%s\n" % (sid, " ".join(fsent)) + \
        "%d\n" % len(reflines) + \
        "".join(reflines)

    flen = len(words_to_chars(fsent, encode_back=True))
    fwlen = len(fsent)
    reversed_fsent = list(reversed(fsent))  ## RIGHT TO LEFT

    if byline != "":
        self.traverse(0, 0, reversed_fsent, rules, nodememo)
        ## swap back
        self.adjust_spans(flen, fwlen)

        byline = byline.split(" ")
        byline_flen = self.i
        byline_fwlen = self.wi
        byline_f = fsent[:byline_fwlen]

        print >> logs, "clen (non-byline) = %d (%d)" % (flen, self.j - self.i)
        print >> logs, "wlen (non-byline) = %d (%d)" % (fwlen, self.wj - self.wi)
        print >> logs, "BYLINE = " + " ".join(byline_f) + \
              " ### %d chars, %d words" % (byline_flen, byline_fwlen)

        ## check consistency
        assert len(words_to_chars(byline_f)) == byline_flen, "@sentence %d, BYLINE Error" % opts.sentid

        ## new rule/edge
        ## TOP("by" "line" x0:TOP) -> "BY" "LINE" x0 ### id=-1
        byline_e = " ".join('"%s"' % w for w in byline)
        lhs = "TOP(" + byline_e + " x0:%s)" % self.x  # "TOP"
        rhs = " ".join('"%s"' % w for w in byline_f) + " x0"

        # byline rule, id=-1
        rid = -1
        rules[rid] = "%s -> %s ### id=%d" % (lhs, rhs, rid)

        ## make david-style LHS
        david_lhs = []
        for w in byline:
            david_lhs.append(sym.fromstring(w))
        david_lhs.append(sym.setindex(dummylabel, 1))

        ded = Deduction([self],
                        rule.Rule(rid, rule.Phrase(david_lhs), rule.Phrase(david_lhs)),
                        svector.Vector())
        ded.lhsstr = byline_e.split() + [self]  ## N.B.: dont forget "..."
        ded.ruleid = rid

        # new node on top of TOP
        oldtop = self
        self = Item(self.x, 0, flen, deds=[ded])
        self.x = oldtop.x
        self.wi = 0
        self.wj = fwlen
        self.id = len(nodememo) + 1
        nodememo[id(self)] = (self.id, nodememo[id(oldtop)][1] + 1)  # edges
    else:
        # establish node spans
        self.traverse(0, 0, reversed_fsent, rules, nodememo)
        # swap i,j
        self.adjust_spans(flen, fwlen)

    ## lhuang: the following is from hope.py
    ## be very careful about weights interpolation
    sg = sgml.Sentence(fsent)
    sg.fwords = fsent
    sg.refs = [refline.split(" ") for refline in reflines]

    if sg.refs:
        theoracle.input(sg, verbose=False)

        # 1-best
        self.reweight(weights)
        output(self, "1-best @ %s" % sid, onebestbleus, onebestscores)

        base_oracleweights = theoracle.make_weights(additive=True)
        # we use the in-place operations because oracleweights might be
        # a subclass of Vector
        for relative in []:  # [opts.hope]:
            oracleweights = theoracle.make_weights(additive=True)
            oracleweights *= relative

            # interpolation: taking modelcost into account
            oracleweights += weights

            # compute oracle
            self.rescore(theoracle.models, oracleweights, add=True)  # TODO: why??
            output(self, "hope%s " % relative, hopebleus[relative], hopescores[relative])

    # right boundary should match sentence length (in chars)
    assert self.j == flen and self.wj == fwlen, \
        "@sentence %d, Boundary Mismatch at %s\t%s" % (opts.sentid, sid, fsent) + \
        "self.j=%d, flen=%d; self.wj=%d, fwlen=%d" % (self.j, flen, self.wj, fwlen)

    s += "%d\t%d\n" % nodememo[id(self)] + \
         self._dump(rules, deriv=self.viterbi_deriv())

    return s
def forest_from_text_helper(tokiter, memo, want_item=False, delete_words=[]):
    """Currently this assumes that the only frontier nodes in the tree
    are words."""
    while True:
        try:
            tok = tokiter.next()
            toktype = tok[0]
        except StopIteration:
            raise TreeFormatException("incomplete tree")

        if toktype == "or":
            _, nodeid = tok
            deds = list(forest_from_text_helper(tokiter, memo,
                                                delete_words=delete_words))
            node = Item(dummylabel, dummyi, dummyj, deds=deds)
            if nodeid:
                memo[nodeid] = node
                node.nodeid = nodeid
            yield node

        elif toktype == "nonterm":
            _, nodeid, ruleid, dcoststr = tok
            if ruleid == "":
                ruleid = dummylabel
                xrs_ruleid = None  # no explicit rule id; avoids a NameError below
            else:
                # lhuang: N.B.: sym.fromtag would re-alloc it
                xrs_ruleid = int(ruleid)
                ruleid = sym.fromtag(ruleid)

            dcost = svector.Vector()
            if dcoststr:
                # lhuang: features are read from forest, not rules,
                # so there is no "e^..." or "10^..."
                for fv in dcoststr.split(','):
                    f, v = fv.split(':', 1)
                    v = float(v)
                    dcost[f] = v

            ants = []
            rhs = []
            vi = 1
            for child in forest_from_text_helper(tokiter, memo, want_item=True,
                                                 delete_words=delete_words):
                if isinstance(child, Item):
                    ants.append(child)
                    rhs.append(sym.setindex(dummylabel, vi))
                    vi += 1
                else:
                    rhs.append(child)
            r = rule.Rule(ruleid, rule.Phrase(rhs), rule.Phrase(rhs))

            node = Deduction(ants=ants, rule=r, dcost=dcost)
            node.ruleid = xrs_ruleid
            if want_item:  # need to insert OR node
                node = Item(dummylabel, dummyi, dummyj, deds=[node])
            if nodeid:
                memo[nodeid] = node
            yield node

        elif toktype == 'term':
            terminal = tok[1]
            if terminal not in delete_words:
                yield sym.fromstring(terminal)

        elif toktype == 'ref':
            yield memo[tok[1]]

        elif toktype == 'pop':
            return

        else:
            raise TreeFormatException("unknown token %s" % (tok,))
def get_cn(sentence):
    sentence = chain(('<s>',), sentence.split(), ('</s>',))
    sentence = (sym.fromstring(word, terminal=True) for word in sentence)
    return tuple(((word, None, 1),) for word in sentence)
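# Illustration (not from the original source): get_cn("la maison") returns a
# trivial confusion network with one single-alternative column per token,
# schematically:
#
#     (((<s>, None, 1),), ((la, None, 1),), ((maison, None, 1),), ((</s>, None, 1),))
#
# where each word stands for the numberized symbol from sym.fromstring.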
def translate(self, input):
    """input: any object that has an attribute 'words' which is a list
       of numberized French words, an 'id' attribute, and an
       'instruction' attribute
       output: a forest"""
    if self.decoder_age >= 100:
        self.start_decoder()
    restarts = 0
    self.decoder_age += 1
    outforest = ""
    while restarts <= 3:
        try:
            self.send_weights(input=input)
            outforest = self.instruct(input)
            if outforest == "" or not self.create_forest(outforest) or self.child.poll() is not None:
                continue
            else:
                break
            # graehl->pust: careful - restarts += 1 doesn't happen on continue.
            # infinite loop possible if decoder really outputs no forest (I
            # think you changed it so a dummy forest is output, so this may be
            # what you want? just bad for error reporting if you hang forever)
        except:
            lastexcept = log.strexcept(True)
            log.writeln("CAUGHT exception: %s" % lastexcept)
        restarts += 1
        if restarts <= 3:
            log.writeln("restarting decoder")
            self.start_decoder()
        else:
            self.start_decoder()
            #raise Exception("too many decoder restarts for %s, giving up - last was: %s"%(input,lastexcept))
            #don't raise because of global 100-retries limit in trainer.py
            log.write("too many decoder restarts, giving up on exception %s:\n%s\nwith weights:\n%s\n"
                      % (lastexcept, repr(input), self.weights))
            self.create_forest("(0<noparse:1> )")

    # self.send_instruction('weights diff "%s";' % weightstring, input)
    # self.oldweights = svector.Vector(self.weights)
    # self.send_instruction(input.instruction,input)
    # outforest = self.child.recvline()

    # restarts = 0
    # while outforest == "" or self.child.poll() is not None:
    #     log.writeln("restarting decoder")
    #     self.start_decoder()
    #     if restarts > 3:
    #         raise Exception("too many decoder restarts, giving up")
    #     self.send_instruction('weights "%s";' % weightstring, input)
    #     self.send_instruction(input.instruction, input)
    #     outforest = self.child.recvline()
    #     restarts += 1

    log.writeln("received forest: %s...%s for %s" % (outforest[:80], outforest[-80:], input))
    #sys.stderr.write("received forest: %s\n" % (outforest,))

    # try:
    #     f = forest.forest_from_text(outforest, delete_words=['@UNKNOWN@'])
    # except forest.TreeFormatException:
    #     badforestf = '%s/badforest.%s' % (badforestdir, input.id)
    #     log.write("dumping bad forest to %s\n" % (badforestf,))
    #     forestfile = file(badforestf, "w")
    #     forestfile.write(outforest)
    #     forestfile.close()
    #     raise
    f = self.forest
    self.forest = None
    #sys.stderr.write("internal forest: %s\n" % (forest.forest_to_text(f, mode='english')))

    for item in f.bottomup():
        for ded in item.deds:
            # replace rule's French side with correct number of French words
            # we don't even bother to use the right number of variables
            ded.rule = rule.Rule(ded.rule.lhs,
                                 rule.Phrase([sym.fromstring('<foreign-word>')]
                                             * int(ded.dcost['foreign-length'])),
                                 ded.rule.e)
            for feature in delete_features:
                del ded.dcost[feature]

    f.reweight(self.weights)  # because forest_from_text doesn't compute viterbi

    return f
# cn.py
# Chris Dyer <*****@*****.**>
# Copyright (c) 2006 University of Maryland.

# vim:tabstop=4:autoindent:expandtab

import sys
import math
import sym
import log
import sgml

epsilon = sym.fromstring('*EPS*')

class CNStats(object):
    def __init__(self):
        self.read = 0
        self.colls = 0
        self.words = 0

    def collect(self, cn):
        self.read += 1
        self.colls += cn.get_length()
        for col in cn.columns:
            self.words += len(col)

    def __str__(self):
        return "confusion net statistics:\n succ. read: %d\n columns: %d\n words: %d\n avg. words/column:\t%f\n avg. cols/sent:\t%f\n\n" % \
            (self.read, self.colls, self.words,
             float(self.words) / float(self.colls),
             float(self.colls) / float(self.read))

class ConfusionNet(object):
    def __init__(self, sent):
# Copyright (c) 2004-2006 University of Maryland. All rights
# reserved. Do not redistribute without permission from the
# author. Not for commercial use.

import sys, os, os.path
import monitor
import math
import heapq
import sym, rule, cost
import log
from filter import Filter

log.level = 1

PHRASE = sym.fromstring('[PHRASE]')

P_IMPROBABLE = 1e-7  # weights are recorded in our files to the 1e-6 place

def costfromprob(p):
    try:
        return -math.log10(p)
    except (ValueError, OverflowError):
        return cost.IMPOSSIBLE

profile = False

if not profile:
    try:
        import psyco
        psyco.profile()
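# Examples (not from the original source): costfromprob converts a
# probability to a -log10 cost, mapping p = 0 (or a negative value) to the
# sentinel cost.IMPOSSIBLE instead of raising:
#
#     costfromprob(0.001)  # -> 3.0
#     costfromprob(0.0)    # -> cost.IMPOSSIBLE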
def clean(self, v):
    """Return a copy of v that doesn't have any of the features used
    for the oracle."""
    v = svector.Vector(v)
    for f in self.feats:
        del v[f]
    return v

HOLE = sym.fromstring("<elided>")

def make_state(ewords, order):
    if order == 1:
        return (HOLE,)
    elif len(ewords) < order:
        return ewords
    else:
        return ewords[:order-1] + (HOLE,) + ewords[-order+1:]

class OracleModel(model.Model):
    def __init__(self, order=4):
        model.Model.__init__(self)
        self.order = order
        self.feat = ["oracle.match%d" % o for o in xrange(order)]
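# Illustration (not from the original source): make_state above truncates a
# hypothesis's English words to the n-gram context the oracle needs,
# replacing the elided middle with the HOLE symbol. With order=4 and plain
# strings standing in for numberized words:
#
#     make_state(("the", "cat", "sat", "on", "the", "mat"), 4)
#     # -> ("the", "cat", "sat", "<elided>", "on", "the", "mat")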