Code example #1
File: convert_forest.py    Project: rupenp/transforest
    def traverse(self, right_idx=0, right_widx=0, fsent=None, rules=None, nodememo=None):        
        '''helper called by dump(); figures out each node's span, prunes its edges, and records (node id, edge count) in nodememo'''

        if nodememo is None:
            nodememo = {}

        if id(self) in nodememo:
            return

        # rank incoming edges by model cost (dot product with weights)
        # and keep only the cheapest max_edges_per_node of them
        deds = [(ded.dcost.dot(weights), ded) for ded in self.deds]
        deds.sort()
        deds = [x for _, x in deds[:max_edges_per_node]]
        self.deds = deds  # prune!

        nedges = len(deds)  # accumulating number of edges, recursively
        
        self.i = right_idx
        self.wi = right_widx

        for dedid, ded in enumerate(deds):
            try:
                rule = rules[ded.ruleid]
            except LookupError:  # rule id missing from the rules table
                print >> sys.stderr, "WARNING: rule %d not found" % ded.ruleid
                ## assuming it's a one-word UNKNOWN rule
                ## TODO: check with lattice
                unkword = fsent[self.wi]
                rule = 'UNKNOWN("@UNKNOWN@") -> "%s"' % unkword  # in reverse order
                rules[ded.ruleid] = rule
                print >> sys.stderr, "         covering " + unkword
                
                
            self.x = rule.split("(", 1)[0]  # non-terminal label

            # analyse RHS (chinese side)
            lhs, rhs = rule.split(" -> ", 1) ## -> might be a word

            # deal with lhs; convert to ded.lhsstr = ["...", "...", Item(...), "..."]
            varid = 0
            lhsstr = []
            for child in ded.rule.e:
                if sym.isvar(child):
                    lhsstr.append(ded.ants[varid])
                    varid += 1
                else:
                    lhsstr.append(quoteattr(sym.tostring(child)))

            # will be used in _dump()
            ded.lhsstr = lhsstr                
            
            vars = []
            chars_in_gap = 0
            words_in_gap = 0
            for it in reversed(rhs.split()):  ## from RIGHT to LEFT!! N.B. can't split(" ")
                if it[0] == "x":
                    #variable:
                    var = int(it[1:])
                    vars.append((var, chars_in_gap, words_in_gap))
                    chars_in_gap = 0
                    words_in_gap = 0
                else:
                    # strip off quotes "..."
                    it = it[1:-1]
                    # calculate char-length
                    if it == foreign_sentence_tag: # <foreign-sentence>:
                        # glue symbol is not counted!
                        chars_in_gap += 0
                        words_in_gap += 0
                    else:
                        # 1 for word, len(...) for char
                        chars_in_gap += len(words_to_chars(it, encode_back=True)) 
                        words_in_gap += 1

            accumu = self.i  ## left boundary
            waccumu = self.wi
            for i, c_gap, w_gap in vars:
            ##for sub in ded.ants:
                sub = ded.ants[i]
                if id(sub) not in nodememo:
                    sub.traverse(accumu + c_gap, waccumu + w_gap, fsent, rules, nodememo)
                    # accumulating # of edges (if first seen)
                    nedges += nodememo[id(sub)][1]

                ## don't accumulate subs now; will do in another visit
##                s += subs
                accumu = sub.j
                waccumu = sub.wj

            tmp_j = (ded.ants[vars[-1][0]].j if vars != [] else self.i) + chars_in_gap
            if self.j is not None and self.j != tmp_j:
                assert False, "@sentence %d, node #%s, %d %d != %d %s rule %d" % \
                       (opts.sentid, self.nodeid, self.i, self.j, tmp_j, self.x, ded.ruleid)
            self.j = tmp_j

            tmp_wj = (ded.ants[vars[-1][0]].wj if vars != [] else self.wi) + words_in_gap ##
            self.wj = tmp_wj
                
        self.id = len(nodememo) + 1
        nodememo[id(self)] = (self.id, nedges)
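
The core of traverse() is the right-to-left scan over the rule's Chinese RHS: for each variable it records how many characters and words sit in the gap to its right, which is what lets the recursive calls position each antecedent's span. Below is a minimal standalone sketch of just that scan; the helper name gaps_right_to_left and the use of len() as a stand-in for words_to_chars() are illustrative assumptions, not part of convert_forest.py.

def gaps_right_to_left(rhs):
    """Scan a rule RHS such as '"A" x1 "BB" x0 "C"' from right to left and
    return [(var_index, chars_in_gap, words_in_gap)] for each variable,
    where the gaps count the quoted words to that variable's RIGHT."""
    found = []
    chars_in_gap = 0
    words_in_gap = 0
    for it in reversed(rhs.split()):
        if it[0] == "x":                      # a variable such as x0, x1, ...
            found.append((int(it[1:]), chars_in_gap, words_in_gap))
            chars_in_gap = 0
            words_in_gap = 0
        else:
            word = it[1:-1]                   # strip the surrounding quotes
            chars_in_gap += len(word)         # toy stand-in for words_to_chars()
            words_in_gap += 1
    return found

print(gaps_right_to_left('"A" x1 "BB" x0 "C"'))
# -> [(0, 1, 1), (1, 2, 1)]: x0 has "C" in the gap to its right, x1 has "BB"
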
Code example #2
File: convert_forest.py    Project: rupenp/transforest
    def dump(self, rules=None, sid=1, fsent="<foreign-sentence>", byline="", reflines=[]):

        nodememo = {}   # to keep track of sizes (# of nodes, # of edges)
        # forest id, foreign sentence (TODO: refs)

        fsent = fsent.split(" ")

        s = "%s\t%s\n" % (sid, " ".join(fsent)) + \
            "%d\n" % len(reflines) + \
            "".join(reflines)


        flen = len(words_to_chars(fsent, encode_back=True))        
        fwlen = len(fsent)

        reversed_fsent = list(reversed(fsent))  ## RIGHT TO LEFT
        
        if byline != "":
            self.traverse(0, 0, reversed_fsent, rules, nodememo)
            ## swap back
            self.adjust_spans(flen, fwlen)

            byline = byline.split(" ")
            byline_flen = self.i
            byline_fwlen = self.wi
            byline_f = fsent[:byline_fwlen]

            print >> logs, "clen (non-byline) = %d (%d)" % (flen, self.j - self.i)
            print >> logs, "wlen (non-byline) = %d (%d)" % (fwlen, self.wj - self.wi)            
            print >> logs, "BYLINE = " + " ".join(byline_f) + \
                  " ### %d chars, %d words" % (byline_flen, byline_fwlen)

            assert len(words_to_chars(byline_f)) == byline_flen, "@sentence %d, BYLINE Error" % opts.sentid ## check consistency

            ## new rule/edge
            ## TOP("by" "line" x0:TOP) -> "BY" "LINE" x0 ### id=-1

            byline_e = " ".join('"%s"' % w for w in byline)
            lhs = "TOP(" + byline_e + " x0:%s)" % self.x  # "TOP"
            rhs = " ".join('"%s"' % w for w in byline_f) + " x0"
            # byline rule, id=-1
            rid = -1
            rules[rid] = "%s -> %s ### id=%d" % (lhs, rhs, rid)

            ## make david-style LHS
            david_lhs = []
            for w in byline:
                david_lhs.append(sym.fromstring(w))
            david_lhs.append(sym.setindex(dummylabel, 1))
            
            ded = Deduction([self], rule.Rule(rid, rule.Phrase(david_lhs), rule.Phrase(david_lhs)),\
                            svector.Vector())
            ded.lhsstr = byline_e.split() + [self] ## N.B.: dont forget "..."
            ded.ruleid = rid
            
            # new node on top of TOP
            oldtop = self
            self = Item(self.x, 0, flen, deds=[ded])
            self.x = oldtop.x
            self.wi = 0
            self.wj = fwlen
            self.id = len(nodememo)+1
            nodememo[id(self)] = (self.id, nodememo[id(oldtop)][1]+1) #edges


            
        else:
            # establish node spans 
            self.traverse(0, 0, reversed_fsent, rules, nodememo)

            # swap i,j 
            self.adjust_spans(flen, fwlen)


        ## lhuang: the following is from hope.py
        ## be very careful about weights interpolation
        sg = sgml.Sentence(fsent)
        sg.fwords = fsent
        sg.refs = [refline.split(" ") for refline in reflines]

        if sg.refs:
            
            theoracle.input(sg, verbose=False)
            # 1-best
            self.reweight(weights)

            output(self, "1-best @ %s" % sid, onebestbleus, onebestscores)


            base_oracleweights = theoracle.make_weights(additive=True)
            # we use the in-place operations because oracleweights might be
            # a subclass of Vector

            for relative in []:  # hope computation disabled; was [opts.hope]
                oracleweights = theoracle.make_weights(additive=True)
                oracleweights *= relative

                # interpolation: taking modelcost into account
                oracleweights += weights

                # compute oracle
                self.rescore(theoracle.models, oracleweights, add=True)
                # TODO: why??
                output(self, "hope%s  " % relative, hopebleus[relative], hopescores[relative])
            

        # right boundary should match sentence length (in chars)
        assert self.j == flen and self.wj == fwlen, \
               "@sentence %d, Boundary Mismatch at %s\t%s; " % (opts.sentid, sid, fsent) + \
               "self.j=%d, flen=%d;  self.wj=%d, fwlen=%d" % (self.j, flen, self.wj, fwlen)
        
        s += "%d\t%d\n" % nodememo[id(self)] + \
             self._dump(rules, deriv=self.viterbi_deriv())
        
        return s
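
For orientation, the string returned by dump() begins with a small plain-text header before the _dump() body: the forest id and sentence, the number of reference lines followed by the references themselves, and finally the node/edge counts taken from nodememo. Here is a minimal sketch of just that header, assembled the same way as in the code above; the function name forest_header is an illustrative assumption.

def forest_header(sid, fsent_words, reflines, num_nodes, num_edges):
    """Assemble the header that precedes the _dump() body: id + sentence,
    reference count and reference lines, then node/edge counts."""
    header = "%s\t%s\n" % (sid, " ".join(fsent_words))   # forest id and sentence
    header += "%d\n" % len(reflines)                     # number of references
    header += "".join(reflines)                          # each refline ends in "\n"
    header += "%d\t%d\n" % (num_nodes, num_edges)        # from nodememo[id(root)]
    return header

print(forest_header(1, ["<foreign-sentence>", "a", "b"],
                    ["a reference translation\n"], num_nodes=5, num_edges=7))
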