def score_rule(self, a, r):
    """Return the lexical weight/ratio scores of rule ``r``.

    For every non-variable symbol on the source side ``r.f`` the entry
    ``self.fweights[r.fpos[i]]`` is multiplied into the source weight and
    ``self.fratios[r.fpos[i]]`` is added to the source ratio; for every
    non-variable symbol on the target side ``r.e`` the entry
    ``self.eweights[r.epos[i]]`` is multiplied into the target weight.

    ``a`` is accepted for interface compatibility and is not used.

    Returns a list holding, in this order, the source weight, the target
    weight and the source ratio -- each one only when the corresponding
    table on ``self`` is not None.
    """
    use_fweights = self.fweights is not None
    use_eweights = self.eweights is not None
    use_fratios = self.fratios is not None

    fweight = eweight = 1.0
    fratio = 0.0

    # enumerate() replaces the xrange(len(...)) index loop; the position
    # tables are still indexed by the symbol's offset within the rule.
    for i, word in enumerate(r.f):
        if sym.isvar(word):
            continue
        if use_fweights:
            fweight *= self.fweights[r.fpos[i]]
        if use_fratios:
            fratio += self.fratios[r.fpos[i]]

    # Without a target table the product stays 1.0 and is not reported,
    # so the scan over r.e can be skipped entirely.
    if use_eweights:
        for i, word in enumerate(r.e):
            if not sym.isvar(word):
                eweight *= self.eweights[r.epos[i]]

    scores = []
    if use_fweights:
        scores.append(fweight)
    if use_eweights:
        scores.append(eweight)
    if use_fratios:
        scores.append(fratio)
    return scores
Beispiel #2
0
    def score_rule(self, a, r):
        """Return the lexical weight/ratio scores of rule ``r``.

        Non-variable symbols of ``r.f`` contribute
        ``self.fweights[r.fpos[i]]`` (as a product) and
        ``self.fratios[r.fpos[i]]`` (as a sum); non-variable symbols of
        ``r.e`` contribute ``self.eweights[r.epos[i]]`` (as a product).

        ``a`` is accepted for interface compatibility and is not used.

        Returns a list with the source weight, target weight and source
        ratio, in that order, each present only when the matching table
        on ``self`` is not None.
        """
        use_fweights = self.fweights is not None
        use_eweights = self.eweights is not None
        use_fratios = self.fratios is not None

        fweight = eweight = 1.0
        fratio = 0.0

        # enumerate() replaces the xrange(len(...)) index loop; the
        # position tables are still indexed by the offset within the rule.
        for i, word in enumerate(r.f):
            if sym.isvar(word):
                continue
            if use_fweights:
                fweight *= self.fweights[r.fpos[i]]
            if use_fratios:
                fratio += self.fratios[r.fpos[i]]

        # Without a target table the product is never reported, so the
        # scan over r.e can be skipped.
        if use_eweights:
            for i, word in enumerate(r.e):
                if not sym.isvar(word):
                    eweight *= self.eweights[r.epos[i]]

        scores = []
        if use_fweights:
            scores.append(fweight)
        if use_eweights:
            scores.append(eweight)
        if use_fratios:
            scores.append(fratio)
        return scores
Beispiel #3
0
 def feature(fphrase, ephrase, paircount, fcount, fsample_count):
     """Sum, over source words, the cost of the best-scoring target word.

     Variables are skipped on both sides and the literal 'NULL' is
     appended to the target words.  For each source word the maximum of
     ``ttable.get_score(f, e, 1)`` over the target words is taken; it
     contributes ``-log10(max)`` when positive and ``MAXSCORE`` otherwise.
     ``paircount``, ``fcount`` and ``fsample_count`` are not used.
     """
     targets = [sym.tostring(w) for w in ephrase if not sym.isvar(w)]
     targets.append('NULL')

     total = 0
     for w in fphrase:
         if sym.isvar(w):
             continue
         source = sym.tostring(w)
         best = max(ttable.get_score(source, target, 1) for target in targets)
         if best > 0:
             total += -math.log10(best)
         else:
             total += MAXSCORE
     return total
Beispiel #4
0
    def score_rule(self, a, r):
        """Count the unaligned terminal symbols on each side of rule ``r``.

        A non-variable symbol of ``r.f`` is counted when
        ``a.faligned[r.fpos[i]]`` is falsy; a non-variable symbol of
        ``r.e`` is counted when ``a.ealigned[r.epos[i]]`` is falsy.

        Returns ``[funaligned, eunaligned]``.
        """
        # enumerate() replaces the xrange(len(...)) index loops.
        funaligned = sum(
            1 for i, word in enumerate(r.f)
            if not sym.isvar(word) and not a.faligned[r.fpos[i]])
        eunaligned = sum(
            1 for i, word in enumerate(r.e)
            if not sym.isvar(word) and not a.ealigned[r.epos[i]])
        return [funaligned, eunaligned]
    def score_rule(self, a, r):
        """Count the unaligned terminal symbols on each side of rule ``r``.

        Source symbols are checked against ``a.faligned[r.fpos[i]]`` and
        target symbols against ``a.ealigned[r.epos[i]]``; variables are
        ignored on both sides.

        Returns ``[funaligned, eunaligned]``.
        """
        funaligned = eunaligned = 0

        # enumerate() replaces the xrange(len(...)) index loops.
        for i, word in enumerate(r.f):
            if not sym.isvar(word) and not a.faligned[r.fpos[i]]:
                funaligned += 1

        for i, word in enumerate(r.e):
            if not sym.isvar(word) and not a.ealigned[r.epos[i]]:
                eunaligned += 1

        return [funaligned, eunaligned]
Beispiel #6
0
 def _tree_helper(t, antvalues):
     """Parse tree string ``t`` and plug antecedent values into its variables.

     Every frontier node whose label is a variable symbol receives, as its
     first child, the entry of ``antvalues`` selected by the variable's
     1-based index.  Returns the parsed tree.
     """
     parsed = tree.str_to_tree(t)
     for leaf in parsed.frontier():
         label = sym.fromstring(leaf.label)
         if not sym.isvar(label):
             continue
         leaf.insert_child(0, antvalues[sym.getindex(label) - 1])
     return parsed
Beispiel #7
0
def _ded_to_text(node, result, memo, mode=None, weights=None):
    """Append a parenthesised text rendering of deduction ``node`` to ``result``.

    The rendering is ``(`` + ``<id(node)><<value>>`` + one space-prefixed
    entry per child + ``)``.

    node    -- deduction; ``dcost``, ``rule`` and ``ants`` are read.
    result  -- list of string fragments, appended to in place.
    memo    -- passed through unchanged to ``_item_to_text``.
    mode    -- 'french' walks ``node.rule.f``, 'english' walks
               ``node.rule.e`` (both fall back to ``node.ants`` when the
               deduction has no rule); anything else walks ``node.ants``.
    weights -- object with ``dot``; the value is ``cost:<weights.dot(dcost)>``.
               When None (the signature default) the placeholder ``_`` is
               written instead of raising AttributeError.
    """
    if weights is not None:
        vstr = "cost:%s" % weights.dot(node.dcost)
    else:
        vstr = "_"

    # Every edge is tagged with the id of the deduction itself, so the
    # bare (unparenthesised) leaf form is never produced.
    s = "%s<%s>" % (id(node), vstr)

    result.append('(')
    result.append(s)

    if mode == 'french':
        children = node.rule.f if node.rule else node.ants
    elif mode == 'english':
        children = node.rule.e if node.rule else node.ants
    else:
        children = node.ants

    for child in children:
        result.append(' ')
        if isinstance(child, Item):
            _item_to_text(child, result, memo, mode=mode, weights=weights)
        elif sym.isvar(child):
            # A variable stands for the antecedent with that 1-based index.
            _item_to_text(node.ants[sym.getindex(child) - 1], result, memo,
                          mode=mode, weights=weights)
        else:
            result.append(quoteattr(sym.tostring(child)))
    result.append(')')
Beispiel #8
0
        def visit(item):
            ded = self.ded[id(item)]
            if ded.rule:
                align = collections.defaultdict(list)
                if 'align' in ded.rule.attrs:
                    for fi, ei in ded.rule.attrs['align']:
                        align[ei].append(fi)

                result = []
                j1 = None
                for ei, e in enumerate(ded.rule.e):
                    if sym.isvar(e):
                        result.extend(visit(ded.ants[sym.getindex(e) - 1]))
                    else:
                        if len(ded.ants) == 2:
                            j1 = ded.ants[0].j
                        else:
                            j1 = None
                        result.append([
                            ded.rule.f.stringpos(fi, item.i, item.j, j1)
                            for fi in align[ei]
                        ])
                print ded.rule, item.i, item.j, j1, result
                return result
            else:
                return visit(ded.ants[0])
Beispiel #9
0
def _ded_to_xml(node, result, memo, mode, models, weights):
    """Append an ``<and>`` XML element for deduction ``node`` to ``result``.

    With truthy ``weights`` the element is labelled with ``id(node.rule)``
    and carries ``cost = weights.dot(node.dcost)``; otherwise it is
    labelled with ``id(node)`` and has no cost.  A ``<features>`` element
    lists every entry of ``node.dcost``.  Children come from
    ``node.rule.f`` (mode 'french') or ``node.rule.e`` (mode 'english')
    when the deduction has a rule, else from ``node.ants``; items and
    variables recurse through ``_item_to_xml`` and other symbols become
    ``<leaf>`` elements.
    """
    quote = xml.sax.saxutils.quoteattr

    if weights:
        label = quote(str(id(node.rule)))
        cost = quote(str(weights.dot(node.dcost)))
        result.append('<and label=%s cost=%s>' % (label, cost))
    else:
        result.append('<and label=%s>' % quote(str(id(node))))

    result.append('<features>')
    for name, value in node.dcost.iteritems():
        result.append('<feature name=%s value=%s/>'
                      % (quote(name), quote(str(value))))
    result.append('</features>')

    if mode == 'french' and node.rule:
        children = node.rule.f
    elif mode == 'english' and node.rule:
        children = node.rule.e
    else:
        children = node.ants

    for child in children:
        if isinstance(child, Item):
            target = child
        elif sym.isvar(child):
            # A variable stands for the antecedent with that 1-based index.
            target = node.ants[sym.getindex(child) - 1]
        else:
            result.append('<leaf label=%s/>' % quote(sym.tostring(child)))
            continue
        _item_to_xml(target, result, memo,
                     mode=mode, models=models, weights=weights)
    result.append('</and>')
Beispiel #10
0
 def estimate(self, r):
     """Return a rule-local lexical cost for rule ``r``.

     For each non-variable target symbol with a row in ``self.ttable``,
     the row's entries for the rule's non-variable source symbols plus
     its ``None`` entry (``self.epsilon`` when missing) are summed,
     divided by the number of source terminals plus one, and the
     negative log10 is added to the total.  Target symbols without a row
     are skipped.
     """
     cost = 0.0
     for target in r.e:
         if sym.isvar(target):
             continue
         row = self.ttable.get(target, None)
         if row is None:
             continue
         probs = [row.get(source, self.epsilon)
                  for source in r.f if not sym.isvar(source)]
         mass = sum(probs, 0.0) + row.get(None, self.epsilon)
         cost += -math.log10(mass / (len(probs) + 1))
     return cost
Beispiel #11
0
 def estimate(self, r):
     """Return a rule-local lexical cost for rule ``r``.

     Each non-variable target symbol that has a row in ``self.ttable``
     contributes ``-log10`` of the average of the row's entries for the
     rule's source terminals and its ``None`` entry; missing entries
     default to ``self.epsilon``.  Target symbols without a row are
     skipped.
     """
     cost = 0.0
     for target in r.e:
         if sym.isvar(target):
             continue
         row = self.ttable.get(target, None)
         if row is None:
             continue
         mass = 0.0
         terminals = 0
         for source in r.f:
             if sym.isvar(source):
                 continue
             mass += row.get(source, self.epsilon)
             terminals += 1
         mass += row.get(None, self.epsilon)
         cost += -math.log10(mass / (terminals + 1))
     return cost
Beispiel #12
0
 def _tree_helper(t, antvalues):
     """Parse tree string ``t`` and attach antecedent values at its variables.

     Each frontier node labelled with a variable symbol gets the entry of
     ``antvalues`` at the variable's 1-based index inserted as its first
     child.  Returns the parsed tree.
     """
     parsed = tree.str_to_tree(t)
     for leaf in parsed.frontier():
         label = sym.fromstring(leaf.label)
         if sym.isvar(label):
             filler = antvalues[sym.getindex(label) - 1]
             leaf.insert_child(0, filler)
     return parsed
Beispiel #13
0
 def _fake_tree_helper(lhs, rhs, antvalues):
     """Build a one-level tree for a rule from its left- and right-hand side.

     Variables in ``rhs`` are replaced by the entry of ``antvalues`` at
     their 1-based index; every other symbol becomes a childless
     ``tree.Node``.  The root is labelled with ``sym.totag(lhs)``.
     """
     def as_child(x):
         if sym.isvar(x):
             return antvalues[sym.getindex(x) - 1]
         return tree.Node(sym.tostring(x), [])

     children = [as_child(x) for x in rhs]
     return tree.Node(sym.totag(lhs), children)
Beispiel #14
0
 def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count):
     """Return -log10 of the product of per-source-word best lexical scores.

     Variables are dropped from both phrases and "NULL" is appended to
     the target words.  For each source word the largest
     ``self.ttable.get_score(f, e, self.col)`` over the target words is
     taken (never below 0.0) and multiplied into the total.  When the
     product is exactly 0.0 the sentinel 999 is returned instead of
     taking the logarithm.  ``paircount``, ``fcount`` and
     ``fsample_count`` are not used.
     """
     # List comprehensions instead of map/filter: the target words must
     # be a real list because "NULL" is appended below.
     fwords = [sym.tostring(w) for w in fphrase if not sym.isvar(w)]
     ewords = [sym.tostring(w) for w in ephrase if not sym.isvar(w)]
     ewords.append("NULL")

     totalscore = 1.0
     for f in fwords:
         max_score = 0.0
         for e in ewords:
             score = self.ttable.get_score(f, e, self.col)
             if score > max_score:
                 max_score = score
         totalscore *= max_score

     if totalscore == 0.0:
         return 999
     return -math.log10(totalscore)
Beispiel #15
0
 def _fake_tree_helper(lhs, rhs, antvalues):
     """Build a one-level tree for a rule from its left- and right-hand side.

     Each variable in ``rhs`` is replaced by the entry of ``antvalues``
     at its 1-based index; any other symbol becomes a childless
     ``tree.Node``.  The root is labelled with ``sym.totag(lhs)``.
     """
     children = [
         antvalues[sym.getindex(x) - 1] if sym.isvar(x)
         else tree.Node(sym.tostring(x), [])
         for x in rhs
     ]
     return tree.Node(sym.totag(lhs), children)
Beispiel #16
0
def forest_to_json(root, fwords=None, mode=None, models=None, weights=None):
    """Serialise the forest reachable by iterating ``root`` as JSON text.

    The output object has an optional "source" array (when ``fwords`` is
    truthy; int entries are converted with ``sym.tostring``), a "nodes"
    array (one object per item, with a "label" when ``item.x`` is not
    None), the "root" index and an "edges" array with one object per
    deduction.  Edge tails are node indices for items and variables and
    quoted strings for other symbols; ``mode`` picks ``rule.f``
    ('french'), ``rule.e`` ('english') or the antecedents as the tail
    source.  ``models`` and ``weights`` are not used.
    """
    pieces = ['{\n']

    if fwords:
        words = [(sym.tostring(fword) if type(fword) is int else fword)
                 for fword in fwords]
        pieces.append('  "source": [%s],\n'
                      % ",".join(quotejson(word) for word in words))

    items = list(root)
    index_of = {}
    node_lines = []
    for position, item in enumerate(items):
        index_of[item] = position
        if item is root:
            root_position = position
        if item.x is not None:
            node_lines.append('    {"label": %s}'
                              % quotejson(sym.totag(item.x)))
        else:
            node_lines.append('    {}')
    pieces.append('  "nodes": [\n%s\n  ],\n' % ",\n".join(node_lines))
    pieces.append('  "root": %d,\n' % root_position)

    edge_lines = []
    for head, item in enumerate(items):
        for ded in item.deds:
            if mode == 'french' and ded.rule:
                children = ded.rule.f
            elif mode == 'english' and ded.rule:
                children = ded.rule.e
            else:
                children = ded.ants

            tails = []
            for child in children:
                if isinstance(child, Item):
                    tails.append(str(index_of[child]))
                elif sym.isvar(child):
                    # A variable refers to the antecedent with that
                    # 1-based index.
                    antecedent = ded.ants[sym.getindex(child) - 1]
                    tails.append(str(index_of[antecedent]))
                else:
                    tails.append(quotejson(sym.tostring(child)))

            features = ",".join(
                "%s:%s" % (quotejson(name), value)
                for (name, value) in ded.dcost.iteritems())
            edge_lines.append(
                '    {"head": %s, "tails": [%s], "features": %s}\n'
                % (head, ",".join(tails), "{%s}" % features))

    pieces.append('  "edges": [\n%s\n  ]\n' % ",\n".join(edge_lines))
    pieces.append('}')
    return "".join(pieces)
Beispiel #17
0
def forest_to_json(root, fwords=None, mode=None, models=None, weights=None):
    """Serialise the forest reachable by iterating ``root`` as JSON text.

    Emits an optional "source" array (when ``fwords`` is truthy; int
    entries go through ``sym.tostring``), a "nodes" array (one object per
    item, labelled when ``item.x`` is not None), the "root" index and an
    "edges" array with one object per deduction.  ``mode`` selects
    ``rule.f`` ('french'), ``rule.e`` ('english') or the antecedents as
    the source of each edge's tails.  ``models`` and ``weights`` are not
    used.
    """
    def tail_text(ded, child, index_of):
        # Items and variables become node indices; other symbols are
        # emitted as quoted strings.
        if isinstance(child, Item):
            return str(index_of[child])
        if sym.isvar(child):
            return str(index_of[ded.ants[sym.getindex(child) - 1]])
        return quotejson(sym.tostring(child))

    pieces = ['{\n']

    if fwords:
        words = [(sym.tostring(fword) if type(fword) is int else fword)
                 for fword in fwords]
        pieces.append('  "source": [%s],\n'
                      % ",".join(quotejson(word) for word in words))

    items = list(root)
    index_of = {}
    node_lines = []
    for position, item in enumerate(items):
        index_of[item] = position
        if item is root:
            root_position = position
        if item.x is None:
            node_lines.append('    {}')
        else:
            node_lines.append('    {"label": %s}'
                              % quotejson(sym.totag(item.x)))
    pieces.append('  "nodes": [\n%s\n  ],\n' % ",\n".join(node_lines))
    pieces.append('  "root": %d,\n' % root_position)

    edge_lines = []
    for head, item in enumerate(items):
        for ded in item.deds:
            if mode == 'french' and ded.rule:
                children = ded.rule.f
            elif mode == 'english' and ded.rule:
                children = ded.rule.e
            else:
                children = ded.ants

            tails = [tail_text(ded, child, index_of) for child in children]
            features = ",".join(
                "%s:%s" % (quotejson(name), value)
                for (name, value) in ded.dcost.iteritems())
            edge_lines.append(
                '    {"head": %s, "tails": [%s], "features": %s}\n'
                % (head, ",".join(tails), "{%s}" % features))

    pieces.append('  "edges": [\n%s\n  ]\n' % ",\n".join(edge_lines))
    pieces.append('}')
    return "".join(pieces)
Beispiel #18
0
 def transition(self, r, antstates, i, j, j1=None):
     """Return ``(None, cost)`` with the lexical cost of rule ``r``.

     For each non-variable target symbol that has a row in
     ``self.ttable``, the row's entries for all of ``self.fwords`` plus
     its ``None`` entry (``self.epsilon`` when missing) are summed,
     divided by ``len(self.fwords) + 1`` and the negative log10 is added
     to the cost.  ``antstates``, ``i``, ``j`` and ``j1`` are not used.
     """
     cost = 0.0
     for target in r.e:
         if sym.isvar(target):
             continue
         row = self.ttable.get(target, None)
         if row is None:
             continue
         mass = sum((row.get(f, self.epsilon) for f in self.fwords), 0.0)
         mass += row.get(None, self.epsilon)
         cost += -math.log10(mass / (len(self.fwords) + 1))
     return (None, cost)
Beispiel #19
0
 def transition(self, r, antstates, i, j, j1=None):
     """Return ``(None, cost)`` with the lexical cost of rule ``r``.

     Each non-variable target symbol with a row in ``self.ttable``
     contributes ``-log10`` of the average of that row's entries for
     ``self.fwords`` and its ``None`` entry; missing entries default to
     ``self.epsilon``.  ``antstates``, ``i``, ``j`` and ``j1`` are not
     used.
     """
     cost = 0.0
     for target in r.e:
         if sym.isvar(target):
             continue
         row = self.ttable.get(target, None)
         if row is None:
             continue
         mass = 0.0
         for source in self.fwords:
             mass += row.get(source, self.epsilon)
         mass += row.get(None, self.epsilon)
         denominator = len(self.fwords) + 1
         cost += -math.log10(mass / denominator)
     return (None, cost)
Beispiel #20
0
def _ded_to_xml(node, result, memo, mode, models, weights):
    """Append an ``<and>`` XML element for deduction ``node`` to ``result``.

    With truthy ``weights`` the element is labelled with ``id(node.rule)``
    and carries ``cost = weights.dot(node.dcost)``; otherwise it is
    labelled with ``id(node)`` and has no cost.  Every entry of
    ``node.dcost`` is listed inside ``<features>``.  Children come from
    ``node.rule.f`` (mode 'french') or ``node.rule.e`` (mode 'english')
    when the deduction has a rule, else from ``node.ants``; items and
    variables recurse through ``_item_to_xml``, other symbols become
    ``<leaf>`` elements.
    """
    quote = xml.sax.saxutils.quoteattr

    if weights:
        opening = '<and label=%s cost=%s>' % (
            quote(str(id(node.rule))),
            quote(str(weights.dot(node.dcost))))
    else:
        opening = '<and label=%s>' % quote(str(id(node)))
    result.append(opening)

    result.append('<features>')
    for name, value in node.dcost.iteritems():
        result.append('<feature name=%s value=%s/>'
                      % (quote(name), quote(str(value))))
    result.append('</features>')

    if mode == 'french' and node.rule:
        children = node.rule.f
    elif mode == 'english' and node.rule:
        children = node.rule.e
    else:
        children = node.ants

    for child in children:
        if isinstance(child, Item):
            target = child
        elif sym.isvar(child):
            # A variable stands for the antecedent with that 1-based index.
            target = node.ants[sym.getindex(child) - 1]
        else:
            result.append('<leaf label=%s/>' % quote(sym.tostring(child)))
            continue
        _item_to_xml(target, result, memo,
                     mode=mode, models=models, weights=weights)
    result.append('</and>')
Beispiel #21
0
 def add(self, r, estcost=0.):
     """Insert rule ``r`` with estimated cost ``estcost`` into the grammar.

     A rule whose source side has arity 1 and length 1 is logged and
     stored in ``self.unary_rules`` under its de-indexed variable, and the
     pair (variable, ``r.lhs``) is recorded in ``self.unary_less_than``.
     Any other rule is filed in the trie rooted at ``self.root`` -- nodes
     are ``[rulebin, children]`` pairs keyed by source symbol, variables
     de-indexed -- and the bin at the final node is pruned after the
     insert.  ``self.count`` is incremented in both cases.
     """
     if r.f.arity() == 1 and len(r.f) == 1:
         log.write("unary rule: %s\n" % r)
         key = sym.clearindex(r.f[0])
         unary_bin = self.unary_rules.setdefault(
             key, RuleBin(self.threshold, self.limit))
         unary_bin.add(estcost, r)
         self.unary_less_than.add((sym.clearindex(r.f[0]), r.lhs))
     else:
         node = self.root
         for f in r.f:
             key = sym.clearindex(f) if sym.isvar(f) else f
             node = node[1].setdefault(key, [None, {}])
         if node[0] is None:
             node[0] = RuleBin(self.threshold, self.limit)
             self.rulebin_count += 1
         rulebin = node[0]
         rulebin.add(estcost, r)
         rulebin.prune()
     self.count += 1
Beispiel #22
0
def _ded_to_text(node, result, memo, mode=None, weights=None):
    """Append a tagged text rendering of deduction ``node`` to ``result``.

    The rendering is ``(`` + ``ruleid=<id(node.rule)><value=...>`` + one
    tagged entry per child (`` it ``, `` var `` or `` word ``) + ``)``.
    As a side effect the rule id and every terminal word are also printed
    to standard output, one line per deduction (hyperedge).

    ``mode`` selects the children: 'french' -> ``node.rule.f``,
    'english' -> ``node.rule.e`` (both fall back to ``node.ants`` when the
    deduction has no rule), anything else -> ``node.ants``.
    ``memo`` is passed through unchanged to ``_item_to_text``.
    """
    # Convert rule and features into single tokens
    #vstr = ",".join("%s:%s" % (quotefeature(f),node.dcost[f]) for f in node.dcost)
    # lhuang: in case no weights -- the value is "cost:<dot product>" when
    # weights are given and the placeholder "_" otherwise
    vstr = "cost:%s" % weights.dot(node.dcost) if weights is not None \
           else "_"
    # The edge is identified by its rule object, not by the deduction.
    rstr = id(node.rule)
    #rstr = id(node)
    s = "ruleid=%s<value=%s>" % (rstr,vstr)
    # Trailing comma: stay on the same output line; the words of this
    # hyperedge are printed after the rule id.
    print "\truleid=%s" % rstr,
    
    # "if False" disables this branch, so the parenthesised form below is
    # always used.
    if False and len(node.ants) == 0: # the format allows this but only if we don't tag with an id. but we tag everything with an id
        result.append(s)
    else:
        result.append('(')
        result.append(s)
        if mode == 'french':
            children = node.rule.f if node.rule else node.ants
        elif mode == 'english':
            # lhuang: english side of the rule
            # NOTE(review): the original note calls this the default mode,
            # but the signature default is mode=None -- confirm with callers.
            children = node.rule.e if node.rule else node.ants
        else:
            children = node.ants

        for child in children:
            if isinstance(child, Item):
                result.append(' it ')
                _item_to_text(child, result, memo, mode=mode, weights=weights)
            elif sym.isvar(child):
                # lhuang: variable, do recursion on the antecedent with the
                # variable's 1-based index
                result.append(' var ')
                _item_to_text(node.ants[sym.getindex(child)-1], result, memo, mode=mode, weights=weights)
            else:
                # lhuang: english word
                result.append(' word ')
                w = quoteattr(sym.tostring(child))
                result.append(w)
                print w,
        result.append(')')

    print # end of a hyperedge
Beispiel #23
0
        def visit(item):
            ded = self.ded[id(item)]
            if ded.rule:
                align = collections.defaultdict(list)
                if 'align' in ded.rule.attrs:
                    for fi, ei in ded.rule.attrs['align']:
                        align[ei].append(fi)

                result = []
                j1 = None
                for ei, e in enumerate(ded.rule.e):
                    if sym.isvar(e):
                        result.extend(visit(ded.ants[sym.getindex(e)-1]))
                    else:
                        if len(ded.ants) == 2:
                            j1 = ded.ants[0].j
                        else:
                            j1 = None
                        result.append([ded.rule.f.stringpos(fi, item.i, item.j, j1) for fi in align[ei]])
                print ded.rule, item.i, item.j, j1, result
                return result
            else:
                return visit(ded.ants[0])
Beispiel #24
0
def _ded_to_text(node, result, memo, mode=None, weights=None):
    """Append a parenthesised text rendering of deduction ``node`` to ``result``.

    The rendering is ``(`` + ``<id(node)><<value>>`` + one space-prefixed
    entry per child + ``)``.

    node    -- deduction; ``dcost``, ``rule`` and ``ants`` are read.
    result  -- list of string fragments, appended to in place.
    memo    -- passed through unchanged to ``_item_to_text``.
    mode    -- 'french' walks ``node.rule.f``, 'english' walks
               ``node.rule.e`` (both fall back to ``node.ants`` when the
               deduction has no rule); anything else walks ``node.ants``.
    weights -- object with ``dot``; the value is ``cost:<weights.dot(dcost)>``.
               When None (the signature default) the placeholder ``_`` is
               written instead of raising AttributeError.
    """
    if weights is not None:
        vstr = "cost:%s" % weights.dot(node.dcost)
    else:
        vstr = "_"

    # Every edge is tagged with the id of the deduction itself, so the
    # bare (unparenthesised) leaf form is never produced.
    s = "%s<%s>" % (id(node), vstr)

    result.append('(')
    result.append(s)

    if mode == 'french':
        children = node.rule.f if node.rule else node.ants
    elif mode == 'english':
        children = node.rule.e if node.rule else node.ants
    else:
        children = node.ants

    for child in children:
        result.append(' ')
        if isinstance(child, Item):
            _item_to_text(child, result, memo, mode=mode, weights=weights)
        elif sym.isvar(child):
            # A variable stands for the antecedent with that 1-based index.
            _item_to_text(node.ants[sym.getindex(child) - 1],
                          result,
                          memo,
                          mode=mode,
                          weights=weights)
        else:
            result.append(quoteattr(sym.tostring(child)))
    result.append(')')
Beispiel #25
0
    def traverse(self, right_idx=0, right_widx=0, fsent=None, rules=None, nodememo=None):        
        '''Prune this node's deductions and work out its character/word span.

        Original author's note: helper called by dump(); figure out span.

        Every path returns None (the earlier note claimed a string);
        results are stored on the node (``deds``, ``i``, ``wi``, ``j``,
        ``wj``, ``x``, ``id``), on each deduction (``lhsstr``) and in
        ``nodememo``.

        right_idx, right_widx -- stored as ``self.i`` / ``self.wi``.
        fsent    -- indexed by ``self.wi`` to name the word covered by a
                    synthesized UNKNOWN rule.
        rules    -- looked up by ``ded.ruleid`` to get a rule string; a
                    synthesized rule is stored back into it.
        nodememo -- dict keyed by ``id(node)`` holding ``(node id, edge
                    count)``; created here when None.

        Also reads ``weights``, ``max_edges_per_node``,
        ``foreign_sentence_tag``, ``words_to_chars`` and ``opts``, none of
        which are defined in this method (presumably module-level globals
        -- TODO confirm).
        '''

        if nodememo is None:
            nodememo = {}

        # already visited: nothing to do
        if id(self) in nodememo:
            return

        # order deductions by cost (dot product with the weights) and keep
        # only the first max_edges_per_node of them
        deds = [(ded.dcost.dot(weights), ded) for ded in self.deds]
        deds.sort()
        
        deds = [x for _, x in deds[:max_edges_per_node]]
        self.deds = deds # prune!

        nedges = len(deds)  # accumulating number of edges, recursively
        
        self.i = right_idx
        self.wi = right_widx

        for dedid, ded in enumerate(deds):
            # NOTE(review): the bare except catches any failure of the
            # lookup, not just a missing rule id -- confirm this is intended.
            try:
                rule = rules[ded.ruleid]
            except:
                print >> sys.stderr, "WARNING: rule %d not found" % ded.ruleid
                ## assuming it's a one-word UNKNOWN rule
                ## TODO: check with lattice
                unkword = fsent[self.wi]
                rule = 'UNKNOWN("@UNKNOWN@") -> "%s"' % unkword  # in reverse order
                rules[ded.ruleid] = rule
                print >> sys.stderr, "         covering " + unkword
                
                
            self.x = rule.split("(", 1)[0]  # non-terminal label

            # analyse RHS (chinese side)
            lhs, rhs = rule.split(" -> ", 1) ## -> might be a word

            # deal with lhs; convert to ded.lhsstr = ["...", "...", Item(...), "..."]
            # variables take the antecedents in order; other symbols are quoted
            varid = 0
            lhsstr = []
            for child in ded.rule.e:
                if sym.isvar(child):
                    lhsstr.append(ded.ants[varid])
                    varid += 1
                else:
                    lhsstr.append(quoteattr(sym.tostring(child)))

            # will be used in _dump()
            ded.lhsstr = lhsstr                
            
            # vars collects (variable number, chars, words) where chars/words
            # count the terminals scanned since the previous variable
            vars = []
            chars_in_gap = 0
            words_in_gap = 0
            for it in reversed(rhs.split()):  ## from RIGHT to LEFT!! N.B. can't split(" ")
                if it[0] == "x":
                    #variable:
                    var = int(it[1:])
                    vars.append((var, chars_in_gap, words_in_gap))
                    chars_in_gap = 0
                    words_in_gap = 0
                else:
                    # strip off quotes "..."
                    it = it[1:-1]
                    # calculate char-length
                    if it == foreign_sentence_tag: # <foreign-sentence>:
                        # glue symbol is not counted!
                        chars_in_gap += 0
                        words_in_gap += 0
                    else:
                        # 1 for word, len(...) for char
                        chars_in_gap += len(words_to_chars(it, encode_back=True)) 
                        words_in_gap += 1

            accumu = self.i  ## left boundary
            waccumu = self.wi
            for i, c_gap, w_gap in vars:
            ##for sub in ded.ants:
                sub = ded.ants[i]
                if id(sub) not in nodememo:
                    # the antecedent starts after the terminals in the gap
                    sub.traverse(accumu + c_gap, waccumu + w_gap, fsent, rules, nodememo)
                    # accumulating # of edges (if first seen)
                    nedges += nodememo[id(sub)][1]

                ## don't accumulate subs now; will do in another visit
##                s += subs
                accumu = sub.j
                waccumu = sub.wj

            # end of span: end of the last antecedent in vars (or the start
            # when the rule has no variables) plus the leftover terminals
            tmp_j = (ded.ants[vars[-1][0]].j if vars != [] else self.i) + chars_in_gap
            # every deduction of a node must yield the same character end
            if self.j is not None and self.j != tmp_j:
                assert False, "@sentence %d, node #%s, %d %d != %d %s rule %d" % \
                       (opts.sentid, self.nodeid, self.i, self.j, tmp_j, self.x, ded.ruleid)
            self.j = tmp_j

            tmp_wj = (ded.ants[vars[-1][0]].wj if vars != [] else self.wi) + words_in_gap ##
            self.wj = tmp_wj
                
        # ids are handed out in the order nodes finish, starting at 1
        self.id = len(nodememo) + 1
        nodememo[id(self)] = (self.id, nedges)
Beispiel #26
0
    for line in sys.stdin:
        r = rule.rule_from_line(line)
        if r.word_alignments is None:
            scores = r.scores
            scores.extend([scores[0], scores[0]])
            r.scores = scores
            sys.stdout.write("%s\n" % r.to_line())
            progress += 1
            continue

        align = set(r.word_alignments)

        fweight = eweight = 1.0

        for fi in xrange(len(r.f)):
            if not sym.isvar(r.f[fi]):
                fwordweight = 0.
                n = 0
                for ei in xrange(len(r.e)):
                    if (fi, ei) in align:
                        try:
                            fwordweight += fweighttable[r.f[fi]][r.e[ei]]
                        except KeyError:
                            fwordweight += threshold
                        n += 1
                if n > 0:
                    fweight *= fwordweight / n
                else:
                    try:
                        fweight = fweighttable[r.f[fi]][None]
                    except KeyError:
    for line in sys.stdin:
        r = rule.rule_from_line(line)
        if r.word_alignments is None:
            scores = r.scores
            scores.extend([scores[0],scores[0]])
            r.scores = scores
            sys.stdout.write("%s\n" % r.to_line())
            progress += 1
            continue
        
        align = set(r.word_alignments)

        fweight = eweight = 1.0
        
        for fi in xrange(len(r.f)):
            if not sym.isvar(r.f[fi]):
                fwordweight = 0.
                n = 0
                for ei in xrange(len(r.e)):
                    if (fi, ei) in align:
                        try:
                            fwordweight += fweighttable[r.f[fi]][r.e[ei]]
                        except KeyError:
                            fwordweight += threshold
                        n += 1
                if n > 0:
                    fweight *= fwordweight / n
                else:
                    try:
                        fweight = fweighttable[r.f[fi]][None]
                    except KeyError: