Example #1
    def reader(file, transpose=False):
        while True:
            try:
                comment = file.next().rstrip()
                ewords = [sym.fromstring(word) for word in file.next().split()]
                fline = file.next()
                fxwords = Alignment.eline_re.findall(fline)
            except StopIteration:
                return

            (fword, eindices) = fxwords[0]
            if fword == "NULL":
                fxwords = fxwords[1:]
            fxwords = [(sym.fromstring(fword), eindices) for (fword, eindices) in fxwords]
            fwords = [fword for (fword, eindices) in fxwords]

            if not transpose:
                a = Alignment(fwords, ewords, comment)
            else:
                a = Alignment(ewords, fwords, comment)

            for i in xrange(len(fxwords)):
                (fword, eindices) = fxwords[i]
                for eindex in eindices.split():
                    j = int(eindex)-1
                    if not transpose:
                        a.align(i,j)
                    else:
                        a.align(j,i)

            yield a
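This reader appears to consume GIZA++-style A3 output: a comment line, the English sentence, then a line of the form "NULL ({ ... }) fword1 ({ 1 3 }) fword2 ({ 2 }) ...", from which eline_re extracts (fword, eindices) pairs with 1-based English indices. A hedged sketch of that extraction, assuming Alignment.eline_re has roughly this shape:

    import re
    # Assumed stand-in for Alignment.eline_re; the real pattern lives in the Alignment class.
    eline_re = re.compile(r"(\S+) \(\{([^}]*)\}\)")
    fline = "NULL ({ }) la ({ 1 }) maison ({ 2 3 })"
    print eline_re.findall(fline)
    # [('NULL', ' '), ('la', ' 1 '), ('maison', ' 2 3 ')]
    # eindices.split() then yields the 1-based English positions for each French word.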
Example #2
 def reader_pharaoh(ffile, efile, afile):
     progress = 0
     for (fline, eline, aline) in itertools.izip(ffile, efile, afile):
         progress += 1
         fwords = [sym.fromstring(w) for w in fline.split()]
         ewords = [sym.fromstring(w) for w in eline.split()]
         a = Alignment(fwords, ewords)
         for (i,j) in Alignment.aline_re.findall(aline):
             i = int(i)
             j = int(j)
             if i >= len(fwords) or j >= len(ewords):
                 sys.stderr.write("warning, line %d: alignment point (%s,%s) out of bounds (%s,%s)\n" % (progress, i,j,len(fwords),len(ewords)))
                 continue
             a.align(i,j)
         yield a
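A note on the expected input: reader_pharaoh walks three line-aligned files (French, English, alignments) and parses the alignment line in the usual Pharaoh "i-j" format of zero-based source-target index pairs. A minimal sketch of that parsing step, assuming Alignment.aline_re has the shape below (the real pattern lives in the Alignment class):

    import re
    # Assumed stand-in for Alignment.aline_re; pairs like "0-0 1-2 2-1".
    aline_re = re.compile(r"(\d+)-(\d+)")
    print aline_re.findall("0-0 1-2 2-1")
    # [('0', '0'), ('1', '2'), ('2', '1')] -- the (i,j) pairs the loop consumes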
Example #3
    def seed(self, input, grammars, models, weights):
        fwords = [sym.fromstring(f) for f in input.fwords]
        self.models = models
        self.weights = weights

        # Seed the dotchart. This will give the extracted rules

        self.grammars = [(g, DotChart(self, fwords)) for g in grammars if isinstance(g, Grammar)]

        for (g,dotchart) in self.grammars:
            for i in xrange(self.n):
                if g.filterspan(i,i,self.n):
                    dotchart.add(g.root,i,i,())
                    self.dot_added += 1

        for g in grammars:
            if isinstance(g, NewGrammar):
                g.input(input)
                for i in xrange(self.n):
                    for j in xrange(i+1,self.n+1):
                        for (r,) in g.get_rules(i,j):
                            estimate_rule(r, models, weights)
                            self.add_axiom(i, j, r)

        # Last resort for unknown French word: pass it through
        for i in xrange(0, len(fwords)):
            for x in self.default_nonterminals:
                r = rule.Rule(x,
                              rule.Phrase(fwords[i:i+1]),
                              rule.Phrase(fwords[i:i+1]),
                              scores=svector.Vector('unknown', 1.))
                estimate_rule(r, models, weights)
                self.add_axiom(i, i+1, r)
Example #4
 def _tree_helper(t, antvalues):
     t = tree.str_to_tree(t)
     for node in t.frontier():
         x = sym.fromstring(node.label)
         if sym.isvar(x):
             node.insert_child(0, antvalues[sym.getindex(x) - 1])
     return t
Example #5
 def _tree_helper(t, antvalues):
     t = tree.str_to_tree(t)
     for node in t.frontier():
         x = sym.fromstring(node.label)
         if sym.isvar(x):
             node.insert_child(0, antvalues[sym.getindex(x)-1])
     return t
Example #6
 def __init__(self, x, i, j, deds=None, states=None, viterbi=None):
     if type(x) is str:
         x = sym.fromstring(x)
     self.x = x
     self.i = i
     self.j = j
     self.deds = deds if deds is not None else []
     self.states = states
     self.viterbi = viterbi
Example #7
 def __init__(self, x, i, j, deds=None, states=None, viterbi=None):
     if type(x) is str:
         x = sym.fromstring(x)
     self.x = x
     self.i = i
     self.j = j
     self.deds = deds if deds is not None else []
     self.states = states
     self.viterbi = viterbi
Example #8
 def __init__(self, file):
     self.parses = {}
     for line in file:
         t = tree.str_to_tree(line)
         if t is None:
             log.write("warning: couldn't read tree\n")
             continue
         s = tuple([sym.fromstring(node.label) for node in t.frontier()])
         self.parses[s] = t
Example #9
def prepare_lattice(lat):
    if hasattr(lat, "prepared") and lat.prepared:
        return lat
    lat.prepared = True

    # create index by j
    lat.edge_index = collections.defaultdict(list)
    for edge in lat.edges:
        lat.edge_index[edge.j].append(edge)

    # numberize all the words
    for edge in lat.edges:
        edge.w = sym.fromstring(edge.w)
    lat.words = [sym.fromstring(w) for w in lat.words]

    # cache all-pairs shortest paths
    lat.distance = lat.compute_distance()

    return lat
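prepare_lattice is idempotent (guarded by the prepared flag) and, besides numberizing the words, builds an index from each edge's j position to the edges sharing it. The indexing idiom in isolation, with a hypothetical Edge class standing in for the lattice's real edge type:

    import collections

    class Edge(object):  # hypothetical stand-in for illustration
        def __init__(self, j, w):
            self.j, self.w = j, w

    edges = [Edge(1, "la"), Edge(2, "maison"), Edge(2, "batiment")]
    edge_index = collections.defaultdict(list)
    for edge in edges:
        edge_index[edge.j].append(edge)
    print [e.w for e in edge_index[2]]   # ['maison', 'batiment']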
Example #10
 def input(self, sent):
     self.refngrams = collections.defaultdict(int)
     for ref in sent.refs:
         ewords = [sym.fromstring(e) for e in ref]
         ngrams = collections.defaultdict(int)
         for o in xrange(self.order):
             for i in xrange(len(ewords)-o):
                 ngrams[tuple(ewords[i:i+o+1])] += 1
         for ngram in ngrams:
             self.refngrams[ngram] = max(self.refngrams[ngram], ngrams[ngram])
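The inner loops count every n-gram up to self.order within each reference, and the outer max() clips the stored count at the highest count seen in any single reference, which is what BLEU's modified precision needs. A toy run of the same counting loop, using plain strings in place of sym symbols:

    import collections
    order = 2
    refngrams = collections.defaultdict(int)
    for ref in [["the", "cat", "the", "cat"], ["the", "cat"]]:
        ngrams = collections.defaultdict(int)
        for o in xrange(order):
            for i in xrange(len(ref) - o):
                ngrams[tuple(ref[i:i + o + 1])] += 1
        for ngram in ngrams:
            refngrams[ngram] = max(refngrams[ngram], ngrams[ngram])
    print refngrams[("the",)]        # 2, the clipped count from the first reference
    print refngrams[("the", "cat")]  # 2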
Example #11
    def input(self, lat):
        self.rules = collections.defaultdict(list)
        for span in lat.spans:
            i, j = span.i, span.j

            if hasattr(span, 'v'):
                v = svector.Vector(span.v)
            else:
                v = model.zero

            # bug: if new nonterminals are introduced at this point,
            # they will not participate in the topological sort

            r = rule.Rule(sym.fromtag(span.x),
                          rule.Phrase([sym.fromstring(f) for f in span.f]),
                          rule.Phrase([sym.fromstring(e) for e in span.e]),
                          scores=v)
            self.rules[i,j].append((r,))
            if log.level >= 2:
                log.write("added lattice rule at (%d,%d): %s\n" % (i,j,r))
Example #12
 def input(self, sent):
     self.refngrams = collections.defaultdict(int)
     for ref in sent.refs:
         ewords = [sym.fromstring(e) for e in ref]
         ngrams = collections.defaultdict(int)
         for o in xrange(self.order):
             for i in xrange(len(ewords) - o):
                 ngrams[tuple(ewords[i:i + o + 1])] += 1
         for ngram in ngrams:
             self.refngrams[ngram] = max(self.refngrams[ngram],
                                         ngrams[ngram])
Example #13
def read_weightfile(f, threshold=None):
    w = {}
    progress = 0
    log.write("Reading ttable from %s..." % f.name)
    for line in f:
        progress += 1
        if progress % 100000 == 0:
            log.write(".")
        (word1, word2, p) = line.split()
        p = float(p)
        if threshold is not None and p < threshold:
            continue
        if word1 == "NULL":
            word1 = None
        else:
            word1 = sym.fromstring(word1)
        if word2 == "NULL":
            word2 = None
        else:
            word2 = sym.fromstring(word2)
        w.setdefault(word1, {}).setdefault(word2, p)
    log.write("done\n")
    return w
Example #14
def read_weightfile(f, threshold=None):
    w = {}
    progress = 0
    log.write("Reading ttable from %s..." % f.name)
    for line in f:
        progress += 1
        if progress % 100000 == 0:
            log.write(".")
        (word1, word2, p) = line.split()
        p = float(p)
        if threshold is not None and p < threshold:
            continue
        if word1 == "NULL":
            word1 = None
        else:
            word1 = sym.fromstring(word1)
        if word2 == "NULL":
            word2 = None
        else:
            word2 = sym.fromstring(word2)
        w.setdefault(word1,{}).setdefault(word2, p)
    log.write("done\n")
    return w
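read_weightfile expects one whitespace-separated "word1 word2 prob" triple per line, with NULL standing in for the empty word and low-probability entries dropped when a threshold is given. The parsing of a single, hypothetical ttable line in isolation (numberization through sym.fromstring omitted):

    line = "maison house 0.85"   # hypothetical ttable line
    (word1, word2, p) = line.split()
    p = float(p)
    w = {}
    w.setdefault(word1, {}).setdefault(word2, p)
    print w   # {'maison': {'house': 0.85}}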
Example #15
 def input(self, fwords, meta):
     self.ewords = [{} for f in fwords]
     for (tag, attrs, i, j) in meta:
         attrs = sgml.attrs_to_dict(attrs)
         if attrs.has_key('eword'):
             if j-i != 1:
                 log.write("warning: eword attribute given for multi-word French expression")
             ewords = [sym.fromstring(e.strip()) for e in attrs['eword'].split('|')]
             if 'cost' in attrs:
                 costs = [float(x) for x in attrs['cost'].split('|')]
             elif 'prob' in attrs:
                 costs = [-math.log10(float(x)) for x in attrs['prob'].split('|')]
             else:
                 costs = [-math.log10(1.0/len(ewords)) for e in ewords]
             self.ewords[i] = dict(zip(ewords,costs))
Example #16
 def __init__(self, sent):
     object.__init__(self)
     if (len(sent.words) == 0):
         self.columns = ()
         return # empty line, it happens
     line = sent.words[0]
     if (line.startswith("(((")):
         if (len(sent.words) > 1):
             log.write("Bad sentence: %s\n" % (line))
         assert(len(sent.words) == 1) # make sure there are no spaces in your confusion nets!
         line = "((('<s>',1.0,1),)," + line[1:-1] + "(('</s>',1.0,1),))"
         cols = eval(line)
         res = []
         for col in cols:
            x = []
            for alt in col:
                costs = alt[1]
                if (type(costs) != type((1,2))):
                    costs=(float(costs),)
                j=[]
                for c in costs:
                    j.append(float(c))
                cost = tuple(j)
                spanlen = 1
                if (len(alt) == 3):
                    spanlen = alt[2]
                x.append((sym.fromstring(alt[0],terminal=True), None, spanlen))
            res.append(tuple(x))
         self.columns = tuple(res)
     else:  # convert a string of input into a CN
         res = []
         res.append(((sym.fromstring('<s>',terminal=True), None, 1), ))
         for word in sent.words:
            res.append(((sym.fromstring(word,terminal=True), None, 1), ))  # (alt=word, cost=0.0)
         res.append(((sym.fromstring('</s>',terminal=True), None, 1), ))
         self.columns = tuple(res)
Example #17
    def input(self, input):
        self.rules = collections.defaultdict(list)
        for tag, attrs, i, j in input.fmeta:
            attrs = sgml.attrs_to_dict(attrs)
            if attrs.has_key('english'):
                ephrases = attrs['english'].split('|')

                if attrs.has_key('cost'):
                    costs = [float(x) for x in attrs['cost'].split('|')]
                elif attrs.has_key('prob'):
                    costs = [-math.log10(float(x)) for x in attrs['prob'].split('|')]
                else:
                    costs = [-math.log10(1.0/len(ephrases)) for e in ephrases] # uniform
                if len(costs) != len(ephrases):
                    sys.stderr.write("wrong number of probabilities/costs")
                    raise ValueError

                if attrs.has_key('features'):
                    features = attrs['features'].split('|')
                    if len(features) != len(ephrases):
                        sys.stderr.write("wrong number of feature names")
                        raise ValueError
                elif attrs.has_key('feature'):
                    features = [attrs['feature'] for ephrase in ephrases]
                else:
                    features = ['sgml' for ephrase in ephrases]

                if attrs.has_key('label'):
                    tags = attrs['label'].split('|')
                else:
                    tags = [tag.upper()]

                # bug: if new nonterminals are introduced at this point,
                # they will not participate in the topological sort

                for (ephrase,cost,feature) in zip(ephrases,costs,features):
                    for tag in tags:
                        r = rule.Rule(sym.fromtag(tag),
                                      rule.Phrase(input.fwords[i:j]),
                                      rule.Phrase([sym.fromstring(e) for e in ephrase.split()]),
                                      scores=svector.Vector('%s' % feature, cost))
                        self.rules[i,j].append((r,))
Example #18
    def finish(self, v, words):
        """Return a copy of v that contains only the features relevant
        to computing a score. We can also perform any necessary
        corrections to v that are possible knowing the whole
        output."""

        # Actually, for BLEU we just recompute from scratch

        # postprocessing: delete non-ASCII chars and @UNKNOWN@
        words = [sym.tostring(w) for w in words]
        words = " ".join(words)
        words = "".join(c for c in words if ord(c) < 128)
        words = [sym.fromstring(word) for word in words.split()]

        v = svector.Vector()

        cand = collections.defaultdict(int)
        for o in xrange(self.order):
            for i in xrange(len(words) - o):
                cand[tuple(words[i:i + o + 1])] += 1

        match = collections.defaultdict(int)
        for ngram in cand:
            match[len(ngram) - 1] += min(cand[ngram],
                                         self.oraclemodel.refngrams[ngram])

        for o in xrange(self.order):
            v["oracle.match%d" % o] = match[o]
            v["oracle.guess%d" % o] = max(0, len(words) - o)

        v["oracle.srclen"] = self.wordcounter.srclen
        v["oracle.candlen"] = len(words)

        if self.variant == "ibm":
            v["oracle.reflen"] = min(
                (abs(l - len(words)), l) for l in self.wordcounter.reflens)[1]
        else:
            v["oracle.reflen"] = self.wordcounter.reflen

        return v
Example #19
 def input(self, fwords, meta):
     self.ewords = [{} for f in fwords]
     for (tag, attrs, i, j) in meta:
         attrs = sgml.attrs_to_dict(attrs)
         if attrs.has_key('eword'):
             if j - i != 1:
                 log.write(
                     "warning: eword attribute given for multi-word French expression"
                 )
             ewords = [
                 sym.fromstring(e.strip())
                 for e in attrs['eword'].split('|')
             ]
             if 'cost' in attrs:
                 costs = [float(x) for x in attrs['cost'].split('|')]
             elif 'prob' in attrs:
                 costs = [
                     -math.log10(float(x)) for x in attrs['prob'].split('|')
                 ]
             else:
                 costs = [-math.log10(1.0 / len(ewords)) for e in ewords]
             self.ewords[i] = dict(zip(ewords, costs))
Example #20
    def finish(self, v, words):
        """Return a copy of v that contains only the features relevant
        to computing a score. We can also perform any necessary
        corrections to v that are possible knowing the whole
        output."""
        
        # Actually, for BLEU we just recompute from scratch

        # postprocessing: delete non-ASCII chars and @UNKNOWN@
        words = [sym.tostring(w) for w in words]
        words = " ".join(words)
        words = "".join(c for c in words if ord(c) < 128)
        words = [sym.fromstring(word) for word in words.split()]

        v = svector.Vector()

        cand = collections.defaultdict(int)
        for o in xrange(self.order):
            for i in xrange(len(words)-o):
                cand[tuple(words[i:i+o+1])] += 1

        match = collections.defaultdict(int)
        for ngram in cand:
            match[len(ngram)-1] += min(cand[ngram], self.oraclemodel.refngrams[ngram])
        
        for o in xrange(self.order):
            v["oracle.match%d" % o] = match[o]
            v["oracle.guess%d" % o] = max(0,len(words)-o)

        v["oracle.srclen"] = self.wordcounter.srclen
        v["oracle.candlen"] = len(words)
        
        if self.variant == "ibm":
            v["oracle.reflen"] = min((abs(l-len(words)), l) for l in self.wordcounter.reflens)[1]
        else:
            v["oracle.reflen"] = self.wordcounter.reflen

        return v
Example #21
def output(f):
    deriv = f.viterbi_deriv()
    hypv = deriv.vector()
    hyp = deriv.english()
    return "hyp={{{%s}}} derivation={{{%s}}} %s" % (" ".join(sym.tostring(e) for e in hyp), deriv, hypv)

for srcline, forestline, reflines in itertools.izip(srcfile, forestfile, itertools.izip(*reffiles)):
    f = forest.forest_from_text(forestline)

    # the oracle needs to know how long all the French spans are
    for item in f.bottomup():
        for ded in item.deds:
            # replace rule's French side with correct number of French words
            # we don't even bother to use the right number of variables
            ded.rule = rule.Rule(ded.rule.lhs,
                                 rule.Phrase([sym.fromstring('<foreign-word>')]*int(ded.dcost['foreign-length'])),
                                 ded.rule.e)

    f.reweight(weights)
    print "1-best %s" % output(f)

    s = sgml.Sentence(srcline.split())
    s.fwords = srcline.split()
    s.refs = [refline.split() for refline in reflines]
    theoracle.input(s, verbose=False)

    oracleweights = theoracle.make_weights(additive=True)
    # we use the in-place operations because oracleweights might be
    # a subclass of Vector
    oracleweights *= -1
    oracleweights += weights
Example #22
  - change notion of "tight" so that outer phrases are minimal and inner phrases are maximal?

"""

import sys, os, os.path
import monitor
import time, math
import random
import alignment, rule, forest
import sym
import log
import cPickle

log.level = 1

PHRASE = sym.fromstring('[PHRASE]')
START = sym.fromstring('[START]')
nonterminals = [PHRASE]


class XRule(rule.Rule):
    def __init__(self, lhs, f, e, owner=None, scores=None):
        rule.Rule.__init__(self, lhs, f, e, owner=owner, scores=scores)
        self.fpos = self.epos = self.span = None


class Feature(object):
    def __init__(self):
        object.__init__(self)

    def process_alignment(self, a):
        pass
Example #23
    if opts.weightfiles is not None:
        fweightfile = file(opts.weightfiles[0], "w")
        eweightfile = file(opts.weightfiles[1], "w")

    if opts.ratiofile is not None:
        ratiofile = file(opts.ratiofile, "w")
        
    fcount = {}
    ecount = {}
    fecount = {}
    count = 0


    progress = 0
    for a in alignments:
        null = sym.fromstring("NULL")
        # Calculate lexical weights
        for i in xrange(len(a.fwords)):
            for j in xrange(len(a.ewords)):
                if a.aligned[i][j]:
                    count += 1
                    fcount[a.fwords[i]] = fcount.get(a.fwords[i],0)+1
                    ecount[a.ewords[j]] = ecount.get(a.ewords[j],0)+1
                    fecount[(a.fwords[i],a.ewords[j])] = fecount.get((a.fwords[i],a.ewords[j]),0)+1

        for i in xrange(len(a.fwords)):
            if not a.faligned[i]:
                count += 1
                fcount[a.fwords[i]] = fcount.get(a.fwords[i],0)+1
                ecount[null] = ecount.get(null,0)+1
                fecount[(a.fwords[i],null)] = fecount.get((a.fwords[i],null),0)+1
Example #24
    def dump(self, rules=None, sid=1, fsent="<foreign-sentence>", byline="", reflines=[]):

        nodememo = {}   # to keep track of sizes (# of nodes, # of edges)
        # forest id, foreign sentence (TODO: refs)

        fsent = fsent.split(" ")

        s = "%s\t%s\n" % (sid, " ".join(fsent)) + \
            "%d\n" % len(reflines) + \
            "".join(reflines)


        flen = len(words_to_chars(fsent, encode_back=True))        
        fwlen = len(fsent)

        reversed_fsent = list(reversed(fsent))  ## RIGHT TO LEFT
        
        if byline != "":
            self.traverse(0, 0, reversed_fsent, rules, nodememo)
            ## swap back
            self.adjust_spans(flen, fwlen)

            byline = byline.split(" ")
            byline_flen = self.i
            byline_fwlen = self.wi
            byline_f = fsent[:byline_fwlen]

            print >> logs, "clen (non-byline) = %d (%d)" % (flen, self.j - self.i)
            print >> logs, "wlen (non-byline) = %d (%d)" % (fwlen, self.wj - self.wi)            
            print >> logs, "BYLINE = " + " ".join(byline_f) + \
                  " ### %d chars, %d words" % (byline_flen, byline_fwlen)

            assert len(words_to_chars(byline_f)) == byline_flen, "@sentence %d, BYLINE Error" % opts.sentid ## check consistency

            ## new rule/edge
            ## TOP("by" "line" x0:TOP) -> "BY" "LINE" x0 ### id=-1

            byline_e = " ".join('"%s"' % w for w in byline)
            lhs = "TOP(" + byline_e + " x0:%s)" % self.x  # "TOP"
            rhs = " ".join('"%s"' % w for w in byline_f) + " x0"
            # byline rule, id=-1
            rid = -1
            rules[rid] = "%s -> %s ### id=%d" % (lhs, rhs, rid)

            ## make david-style LHS
            david_lhs = []
            for w in byline:
                david_lhs.append(sym.fromstring(w))
            david_lhs.append(sym.setindex(dummylabel, 1))
            
            ded = Deduction([self], rule.Rule(rid, rule.Phrase(david_lhs), rule.Phrase(david_lhs)),\
                            svector.Vector())
            ded.lhsstr = byline_e.split() + [self] ## N.B.: dont forget "..."
            ded.ruleid = rid
            
            # new node on top of TOP
            oldtop = self
            self = Item(self.x, 0, flen, deds=[ded])
            self.x = oldtop.x
            self.wi = 0
            self.wj = fwlen
            self.id = len(nodememo)+1
            nodememo[id(self)] = (self.id, nodememo[id(oldtop)][1]+1) #edges

        else:
            # establish node spans 
            self.traverse(0, 0, reversed_fsent, rules, nodememo)

            # swap i,j 
            self.adjust_spans(flen, fwlen)


        ## lhuang: the following is from hope.py
        ## be very careful about weights interpolation
        sg = sgml.Sentence(fsent)
        sg.fwords = fsent
        sg.refs = [refline.split(" ") for refline in reflines]

        if sg.refs:
            
            theoracle.input(sg, verbose=False)
            # 1-best
            self.reweight(weights)

            output(self, "1-best @ %s" % sid, onebestbleus, onebestscores)


            base_oracleweights = theoracle.make_weights(additive=True)
            # we use the in-place operations because oracleweights might be
            # a subclass of Vector

            for relative in []:#[opts.hope]:
                oracleweights = theoracle.make_weights(additive=True)
                oracleweights *= relative

                # interpolation: taking modelcost into account
                oracleweights += weights

                # compute oracle
                self.rescore(theoracle.models, oracleweights, add=True)
                # TODO: why??
                output(self, "hope%s  " % relative, hopebleus[relative], hopescores[relative])
            

        # right boundary should match sentence length (in chars)
        assert self.j == flen and self.wj == fwlen, \
               "@sentence %d, Boundary Mismatch at %s\t%s" % (opts.sentid, sid, fsent) + \
               "self.j=%d, flen=%d;  self.wj=%d, fwlen=%d" % (self.j, flen, self.wj, fwlen)        
        
        s += "%d\t%d\n" % nodememo[id(self)] + \
             self._dump(rules, deriv=self.viterbi_deriv())
        
        return s        
Example #25
def forest_from_text_helper(tokiter, memo, want_item=False, delete_words=[]):
    """Currently this assumes that the only frontier nodes in the tree are words."""
    while True:
        try:
            tok = tokiter.next()
            toktype = tok[0]
        except StopIteration:
            raise TreeFormatException("incomplete tree")

        if toktype == "or":
            _, nodeid = tok
            deds = list(forest_from_text_helper(tokiter, memo, \
                                                delete_words=delete_words))
            node = Item(dummylabel, dummyi, dummyj, deds=deds)
            if nodeid:
                memo[nodeid] = node
                node.nodeid = nodeid
            yield node

        elif toktype == "nonterm":
            _, nodeid, ruleid, dcoststr = tok
            if ruleid == "":
                ruleid = dummylabel
            else:
                # lhuang: N.B.: sym.fromtag would re-alloc it
                xrs_ruleid = int(ruleid)
                ruleid = sym.fromtag(ruleid)  #int(ruleid) #
                
            dcost = svector.Vector()
            if dcoststr:
                # lhuang: features are read from forest, not rules
                # so there is no "e^..." or "10^..."
                
                for fv in dcoststr.split(','):
                    f,v = fv.split(':',1)
                    v = float(v)
                    dcost[f] = v

            ants = []
            rhs = []
            vi = 1
            for child in forest_from_text_helper(tokiter, memo, want_item=True,\
                                                 delete_words=delete_words):
                if isinstance(child, Item):
                    ants.append(child)
                    rhs.append(sym.setindex(dummylabel, vi))
                    vi += 1
                else:
                    rhs.append(child)
            r = rule.Rule(ruleid, rule.Phrase(rhs), rule.Phrase(rhs))

            node = Deduction(ants=ants, rule=r, dcost=dcost)
            node.ruleid = xrs_ruleid
            
            if want_item: # need to insert OR node
                node = Item(dummylabel, dummyi, dummyj, deds=[node])
            if nodeid:
                memo[nodeid] = node
            yield node

        elif toktype == 'term':
            terminal = tok[1]
            if terminal not in delete_words:
                yield sym.fromstring(terminal)

        elif toktype == 'ref':
            yield memo[tok[1]]

        elif toktype == 'pop':
            return

        else:
            raise TreeFormatException("unknown token %s" % (tok,))
Example #26
                (abs(l - len(words)), l) for l in self.wordcounter.reflens)[1]
        else:
            v["oracle.reflen"] = self.wordcounter.reflen

        return v

    def clean(self, v):
        """Return a copy of v that doesn't have any of the features
           used for the oracle."""
        v = svector.Vector(v)
        for f in self.feats:
            del v[f]
        return v


HOLE = sym.fromstring("<elided>")


def make_state(ewords, order):
    if order == 1:
        return (HOLE, )
    elif len(ewords) < order:
        return ewords
    else:
        return ewords[:order - 1] + (HOLE, ) + ewords[-order + 1:]


class OracleModel(model.Model):
    def __init__(self, order=4):
        model.Model.__init__(self)
        self.order = order
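make_state compresses an English word sequence down to the context an order-n language model can still use: the first and last n-1 words with an <elided> marker in between. A self-contained sketch of its behavior, mirroring the function above with a plain string standing in for the sym-based HOLE:

    HOLE = "<elided>"   # string stand-in for sym.fromstring("<elided>")

    def make_state(ewords, order):
        if order == 1:
            return (HOLE,)
        elif len(ewords) < order:
            return ewords
        else:
            return ewords[:order - 1] + (HOLE,) + ewords[-order + 1:]

    print make_state(("the", "black", "cat", "sat"), 3)
    # ('the', 'black', '<elided>', 'cat', 'sat')
    print make_state(("the", "cat"), 3)   # too short to elide: ('the', 'cat')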
Example #27
def get_cn(sentence):
    sentence = chain(('<s>',), sentence.split(), ('</s>',))
    sentence = (sym.fromstring(word, terminal=True) for word in sentence)
    return tuple(((word, None, 1), ) for word in sentence)
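get_cn wraps a plain sentence in the same column structure a ConfusionNet uses: one column per word, each holding a single (word, cost, spanlen) alternative, with <s> and </s> added at the ends. A sketch of the shape it returns, with sym.fromstring stubbed out as the identity:

    from itertools import chain

    def get_cn(sentence, fromstring=lambda w, terminal=True: w):  # stub for sym.fromstring
        sentence = chain(('<s>',), sentence.split(), ('</s>',))
        sentence = (fromstring(word, terminal=True) for word in sentence)
        return tuple(((word, None, 1),) for word in sentence)

    print get_cn("la maison")
    # ((('<s>', None, 1),), (('la', None, 1),), (('maison', None, 1),), (('</s>', None, 1),))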
Example #28
    def translate(self, input):
        """input: any object that has an attribute 'words' which is a list of numberized French words. and an 'id' attribute. and an 'instruction' attribute
           output: a forest"""

        if self.decoder_age >= 100:
            self.start_decoder()

        restarts = 0
        self.decoder_age += 1
        outforest = ""
        while restarts <= 3:
            try:
                self.send_weights(input=input)
                outforest = self.instruct(input)
                if outforest == "" or not self.create_forest(
                        outforest) or self.child.poll() is not None:
                    continue
                else:
                    break
                # graehl->pust: careful - restarts += 1 doesn't happen on continue. infinite loop possible if decoder really outputs no forest (I think you changed it so a dummy forest is output, so this may be what you want? just bad for error reporting if you hang forever)
            except:
                lastexcept = log.strexcept(True)
                log.writeln("CAUGHT exception: %s" % lastexcept)
                pass
            restarts += 1
            if restarts <= 3:
                log.writeln("restarting decoder")
                self.start_decoder()
            else:
                self.start_decoder()
                #raise Exception("too many decoder restarts for %s, giving up - last was: %s"%(input,lastexcept))
                #don't raise because of global 100-retries limit in trainer.py
                log.write(
                    "too many decoder restarts, giving up on exception %s:\n%s\nwith weights:\n%s\n"
                    % (lastexcept, repr(input), self.weights))
                self.create_forest("(0<noparse:1> )")

        # self.send_instruction('weights diff "%s";' % weightstring, input)
        # self.oldweights = svector.Vector(self.weights)

        # self.send_instruction(input.instruction,input)
        # outforest = self.child.recvline()

        # restarts = 0
        # while outforest == "" or self.child.poll() is not None:
        #     log.writeln("restarting decoder")
        #     self.start_decoder()
        #     if restarts > 3:
        #         raise Exception("too many decoder restarts, giving up")
        #     self.send_instruction('weights "%s";' % weightstring, input)
        #     self.send_instruction(input.instruction, input)
        #     outforest = self.child.recvline()
        #     restarts += 1

        log.writeln("received forest: %s...%s for %s" %
                    (outforest[:80], outforest[-80:], input))
        #sys.stderr.write("received forest: %s\n" % (outforest,))

        # try:
        #     f = forest.forest_from_text(outforest, delete_words=['@UNKNOWN@'])
        # except forest.TreeFormatException:
        #     badforestf='%s/badforest.%s'%(badforestdir,input.id)
        #     log.write("dumping bad forest to %s\n" % (badforestf,))
        #     forestfile = file(badforestf, "w")
        #     forestfile.write(outforest)
        #     forestfile.close()
        #     raise

        f = self.forest
        self.forest = None
        #sys.stderr.write("internal forest: %s\n" % (forest.forest_to_text(f, mode='english')))

        for item in f.bottomup():
            for ded in item.deds:
                # replace rule's French side with correct number of French words
                # we don't even bother to use the right number of variables
                ded.rule = rule.Rule(
                    ded.rule.lhs,
                    rule.Phrase([sym.fromstring('<foreign-word>')] *
                                int(ded.dcost['foreign-length'])), ded.rule.e)

                for feature in delete_features:
                    del ded.dcost[feature]

        f.reweight(
            self.weights)  # because forest_from_text doesn't compute viterbi

        return f
Example #29
# cn.py
# Chris Dyer <*****@*****.**>
# Copyright (c) 2006 University of Maryland.

# vim:tabstop=4:autoindent:expandtab

import sys
import math
import sym
import log
import sgml

epsilon = sym.fromstring('*EPS*')

class CNStats(object):
    def __init__(self):
      self.read = 0
      self.colls = 0
      self.words = 0

    def collect(self, cn):
      self.read += 1
      self.colls += cn.get_length()
      for col in cn.columns:
        self.words += len(col)

    def __str__(self):
      return "confusion net statistics:\n succ. read: %d\n columns:    %d\n words:      %d\n avg. words/column:\t%f\n avg. cols/sent:\t%f\n\n" % (self.read, self.colls, self.words, float(self.words)/float(self.colls), float(self.colls)/float(self.read))

class ConfusionNet(object):
    def __init__(self, sent):
Example #30
    def translate(self, input):
        """input: any object that has an attribute 'words' which is a list of numberized French words. and an 'id' attribute. and an 'instruction' attribute
           output: a forest"""

        if self.decoder_age >= 100:
            self.start_decoder()

        restarts = 0
        self.decoder_age += 1
        outforest=""
        while restarts <= 3:
            try:
                self.send_weights(input=input)
                outforest = self.instruct(input)
                if outforest == "" or not self.create_forest(outforest) or self.child.poll() is not None:
                    continue
                else:
                    break
                # graehl->pust: careful - restarts += 1 doesn't happen on continue. infinite loop possible if decoder really outputs no forest (I think you changed it so a dummy forest is output, so this may be what you want? just bad for error reporting if you hang forever)
            except:
                lastexcept=log.strexcept(True)
                log.writeln("CAUGHT exception: %s" % lastexcept)
                pass
            restarts += 1
            if restarts <= 3:
                log.writeln("restarting decoder")
                self.start_decoder()
            else:
                self.start_decoder()
                #raise Exception("too many decoder restarts for %s, giving up - last was: %s"%(input,lastexcept))
                #don't raise because of global 100-retries limit in trainer.py
                log.write("too many decoder restarts, giving up on exception %s:\n%s\nwith weights:\n%s\n" % (lastexcept,repr(input),self.weights))
                self.create_forest("(0<noparse:1> )")


        # self.send_instruction('weights diff "%s";' % weightstring, input)
        # self.oldweights = svector.Vector(self.weights)

        # self.send_instruction(input.instruction,input)
        # outforest = self.child.recvline()

        # restarts = 0
        # while outforest == "" or self.child.poll() is not None:
        #     log.writeln("restarting decoder")
        #     self.start_decoder()
        #     if restarts > 3:
        #         raise Exception("too many decoder restarts, giving up")
        #     self.send_instruction('weights "%s";' % weightstring, input)
        #     self.send_instruction(input.instruction, input)
        #     outforest = self.child.recvline()
        #     restarts += 1

        log.writeln("received forest: %s...%s for %s" % (outforest[:80],outforest[-80:], input))
        #sys.stderr.write("received forest: %s\n" % (outforest,))

        # try:
        #     f = forest.forest_from_text(outforest, delete_words=['@UNKNOWN@'])
        # except forest.TreeFormatException:
        #     badforestf='%s/badforest.%s'%(badforestdir,input.id)
        #     log.write("dumping bad forest to %s\n" % (badforestf,))
        #     forestfile = file(badforestf, "w")
        #     forestfile.write(outforest)
        #     forestfile.close()
        #     raise

        f = self.forest
        self.forest = None
        #sys.stderr.write("internal forest: %s\n" % (forest.forest_to_text(f, mode='english')))

        for item in f.bottomup():
            for ded in item.deds:
                # replace rule's French side with correct number of French words
                # we don't even bother to use the right number of variables
                ded.rule = rule.Rule(ded.rule.lhs,
                                     rule.Phrase([sym.fromstring('<foreign-word>')]*int(ded.dcost['foreign-length'])),
                                     ded.rule.e)

                for feature in delete_features:
                    del ded.dcost[feature]

        f.reweight(self.weights) # because forest_from_text doesn't compute viterbi

        return f
Example #31
  - change notion of "tight" so that outer phrases are minimal and inner phrases are maximal?

"""

import sys, os, os.path
import monitor
import time, math
import random
import alignment, rule, forest
import sym
import log
import cPickle

log.level = 1

PHRASE = sym.fromstring('[PHRASE]')
START = sym.fromstring('[START]')
nonterminals = [PHRASE]

class XRule(rule.Rule):
    def __init__(self, lhs, f, e, owner=None, scores=None):
        rule.Rule.__init__(self, lhs, f, e, owner=owner, scores=scores)
        self.fpos = self.epos = self.span = None

class Feature(object):
    def __init__(self):
        object.__init__(self)

    def process_alignment(self, a):
        pass
Example #32
    if opts.weightfiles is not None:
        fweightfile = file(opts.weightfiles[0], "w")
        eweightfile = file(opts.weightfiles[1], "w")

    if opts.ratiofile is not None:
        ratiofile = file(opts.ratiofile, "w")

    fcount = {}
    ecount = {}
    fecount = {}
    count = 0

    progress = 0
    for a in alignments:
        null = sym.fromstring("NULL")
        # Calculate lexical weights
        for i in xrange(len(a.fwords)):
            for j in xrange(len(a.ewords)):
                if a.aligned[i][j]:
                    count += 1
                    fcount[a.fwords[i]] = fcount.get(a.fwords[i], 0) + 1
                    ecount[a.ewords[j]] = ecount.get(a.ewords[j], 0) + 1
                    fecount[(a.fwords[i], a.ewords[j])] = fecount.get(
                        (a.fwords[i], a.ewords[j]), 0) + 1

        for i in xrange(len(a.fwords)):
            if not a.faligned[i]:
                count += 1
                fcount[a.fwords[i]] = fcount.get(a.fwords[i], 0) + 1
                ecount[null] = ecount.get(null, 0) + 1
Example #33
def forest_from_text_helper(tokiter, memo, want_item=False, delete_words=[]):
    """Currently this assumes that the only frontier nodes in the tree are words."""
    while True:
        try:
            tok = tokiter.next()
            toktype = tok[0]
        except StopIteration:
            raise TreeFormatException("incomplete tree")

        if toktype == "or":
            _, nodeid = tok
            deds = list(
                forest_from_text_helper(tokiter,
                                        memo,
                                        delete_words=delete_words))
            node = Item(dummylabel, dummyi, dummyj, deds=deds)
            if nodeid:
                memo[nodeid] = node
            yield node

        elif toktype == "nonterm":
            _, nodeid, ruleid, dcoststr = tok
            if ruleid == "":
                ruleid = dummylabel
            else:
                ruleid = sym.fromtag(ruleid)
            dcost = svector.Vector()
            if dcoststr:
                for fv in dcoststr.split(','):
                    f, v = fv.split(':', 1)
                    v = float(v)
                    dcost[f] = v

            ants = []
            rhs = []
            vi = 1
            for child in forest_from_text_helper(tokiter,
                                                 memo,
                                                 want_item=True,
                                                 delete_words=delete_words):
                if isinstance(child, Item):
                    ants.append(child)
                    rhs.append(sym.setindex(dummylabel, vi))
                    vi += 1
                else:
                    rhs.append(child)
            r = rule.Rule(ruleid, rule.Phrase(rhs), rule.Phrase(rhs))

            node = Deduction(ants=ants, rule=r, dcost=dcost)
            if want_item:  # need to insert OR node
                node = Item(dummylabel, dummyi, dummyj, deds=[node])
            if nodeid:
                memo[nodeid] = node
            yield node

        elif toktype == 'term':
            terminal = tok[1]
            if terminal not in delete_words:
                yield sym.fromstring(terminal)

        elif toktype == 'ref':
            yield memo[tok[1]]

        elif toktype == 'pop':
            return

        else:
            raise TreeFormatException("unknown token %s" % (tok, ))
Example #34
# Copyright (c) 2004-2006 University of Maryland. All rights
# reserved. Do not redistribute without permission from the
# author. Not for commercial use.

import sys, os, os.path

import monitor
import math
import heapq

import sym, rule, cost
import log
from filter import Filter
log.level = 1

PHRASE = sym.fromstring('[PHRASE]')

P_IMPROBABLE = 1e-7  # weights are recorded in our files to the 1e-6 place


def costfromprob(p):
    try:
        return -math.log10(p)
    except (ValueError, OverflowError):
        return cost.IMPOSSIBLE


profile = False

if not profile:
    try:
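costfromprob above converts a probability to a base-10 negative log cost, falling back to cost.IMPOSSIBLE when the probability is zero or out of range. A self-contained version of the same conversion, with an inf stand-in for cost.IMPOSSIBLE (whose real value lives in the cost module):

    import math

    IMPOSSIBLE = float("inf")   # stand-in for cost.IMPOSSIBLE

    def costfromprob(p):
        try:
            return -math.log10(p)
        except (ValueError, OverflowError):
            return IMPOSSIBLE

    print costfromprob(0.01)   # 2.0
    print costfromprob(0.0)    # inf; math.log10(0) raises ValueError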
Example #35
# Copyright (c) 2004-2006 University of Maryland. All rights
# reserved. Do not redistribute without permission from the
# author. Not for commercial use.

import sys, os, os.path

import monitor
import math
import heapq

import sym, rule, cost
import log
from filter import Filter
log.level = 1

PHRASE = sym.fromstring('[PHRASE]')

P_IMPROBABLE = 1e-7 # weights are recorded in our files to the 1e-6 place

def costfromprob(p):
    try:
        return -math.log10(p)
    except (ValueError, OverflowError):
        return cost.IMPOSSIBLE

profile = False

if not profile:
    try:
        import psyco
        psyco.profile()
Example #36
        if self.variant == "ibm":
            v["oracle.reflen"] = min((abs(l-len(words)), l) for l in self.wordcounter.reflens)[1]
        else:
            v["oracle.reflen"] = self.wordcounter.reflen

        return v

    def clean(self, v):
        """Return a copy of v that doesn't have any of the features
           used for the oracle."""
        v = svector.Vector(v)
        for f in self.feats:
            del v[f]
        return v

HOLE = sym.fromstring("<elided>")

def make_state(ewords, order):
    if order == 1:
        return (HOLE,)
    elif len(ewords) < order:
        return ewords
    else:
        return ewords[:order-1] + (HOLE,) + ewords[-order+1:]

class OracleModel(model.Model):
    def __init__(self, order=4):
        model.Model.__init__(self)
        self.order = order
        self.feat = ["oracle.match%d" % o for o in xrange(order)]