コード例 #1
0
ファイル: moses_decoder.py プロジェクト: jungikim/sbmt
def make_forest(fieldss):
    """Build a forest from a sequence of hypothesis records.

    Each element of ``fieldss`` is a mapping.  Its ``'hyp'`` entry identifies
    the node the record belongs to; a node is created the first time its id
    is seen and every record adds one deduction to it.

    * A record whose ``'hyp'`` equals 0 contributes a deduction with no
      antecedents, an empty PHRASE rule and an empty cost vector.
    * Any other record is read through its ``'scores'``, ``'back'``,
      ``'src-phrase'``, ``'tgt-phrase'``, ``'cover-start'``, ``'cover-end'``
      and ``'forward'`` entries.  Its single antecedent is the node whose id
      is ``int(record['back'])``, which must already have been created
      (otherwise the lookup raises KeyError).  A negative ``'forward'``
      value marks the node as a goal.

    Returns a fresh ``forest.Item`` holding one deduction per goal node.
    """
    items = {}
    final_ids = set()

    for record in fieldss:
        hyp_id = record['hyp']
        if hyp_id in items:
            item = items[hyp_id]
        else:
            item = forest.Item(sym.fromtag('PHRASE'), 0, 0, [])
            items[hyp_id] = item

        if hyp_id == 0:
            # Hypothesis 0: empty rule, no antecedents, nothing else to read.
            empty_rule = rule.Rule(sym.fromtag('PHRASE'),
                                   rule.Phrase([]),
                                   rule.Phrase([]))
            item.deds.append(forest.Deduction((), empty_rule, svector.Vector()))
            continue

        # Group 1 of the score pattern is a comma-separated list of floats,
        # group 2 is handed to svector.Vector as-is (UTF-8 encoded).
        match = scores_re.match(record['scores'])
        core_scores = [float(piece) for piece in match.group(1).split(',')]
        cost = svector.Vector(match.group(2).encode('utf8'))
        for position, value in enumerate(core_scores):
            cost["_core%d" % position] = value

        predecessor = items[int(record['back'])]
        src_words = record['src-phrase'].encode('utf8').split()
        tgt_words = record['tgt-phrase'].encode('utf8').split()

        covered = int(record['cover-end']) - int(record['cover-start']) + 1
        if len(src_words) != covered:
            sys.stderr.write("warning: French phrase length didn't match covered length\n")

        # Both sides start with the PHRASE variable (index 1) standing for
        # the antecedent, followed by the words of this record.
        src_side = rule.Phrase([sym.setindex(sym.fromtag('PHRASE'), 1)] + src_words)
        tgt_side = rule.Phrase([sym.setindex(sym.fromtag('PHRASE'), 1)] + tgt_words)
        phrase_rule = rule.Rule(sym.fromtag('PHRASE'), src_side, tgt_side)

        item.deds.append(forest.Deduction((predecessor,), phrase_rule, cost))

        if int(record['forward']) < 0:  # goal
            final_ids.add(hyp_id)

    goal = forest.Item(None, 0, 0, [])
    for final_id in final_ids:
        goal.deds.append(
            forest.Deduction((items[final_id],), None, svector.Vector()))
    return goal
コード例 #2
0
ファイル: extractor.py プロジェクト: jungikim/sbmt
            sub_j2 -= j1

            # Check English holes
            # can't lie outside phrase
            if sub_j1 < 0 or sub_j2 > elen:
                return None

            # can't overlap
            for j in xrange(sub_j1, sub_j2):
                if type(ewords[j]) is tuple or ewords[j] is None:
                    return None

            # Set first eword to var, rest to None

            # We'll clean up the Nones later
            v = sym.setindex(sub_x, index)
            fwords[i] = v
            fpos[i] = (sub_i1, sub_i2)
            ewords[sub_j1] = (v, sub_j1 + j1, sub_j2 + j1)
            for j in xrange(sub_j1 + 1, sub_j2):
                ewords[j] = None
            index += 1

    # Require an aligned French word
    if opts.require_aligned_terminal and not flag:
        return None

    epos = []
    new_ewords = []
    for i in xrange(elen):
        if ewords[i] is not None:
コード例 #3
0
ファイル: convert_forest.py プロジェクト: rupenp/transforest
def forest_from_text_helper(tokiter, memo, want_item=False, delete_words=[]):
    """Yield the nodes of one nesting level of a tokenized forest.

    Currently this assumes that the only frontier nodes in the tree are words.

    Tokens are pulled from ``tokiter`` until a ``'pop'`` token ends the
    current level.  ``'or'`` and ``'nonterm'`` tokens open a nested level
    that is read by a recursive call on the same iterator.

    Args:
        tokiter: iterator over token tuples whose first element is the token
            type: ``('or', nodeid)``, ``('nonterm', nodeid, ruleid,
            dcoststr)``, ``('term', word)``, ``('ref', nodeid)`` or a
            ``'pop'`` token.
        memo: dict mapping node ids to already-built nodes.  It is filled in
            for every node with a non-empty id and read for ``'ref'`` tokens.
        want_item: if true, each ``'nonterm'`` Deduction is wrapped in a
            single-deduction Item before being memoized and yielded.
        delete_words: terminals to leave out of the output (never mutated
            here; it is only passed on to the recursive calls).

    Yields:
        Item objects, Deduction objects, terminal symbols and memoized
        nodes, in token order.

    Raises:
        TreeFormatException: if the tokens run out before the closing
            ``'pop'``, or a token has an unknown type.
        KeyError: if a ``'ref'`` token names an id that is not in ``memo``.
    """
    while True:
        try:
            # The next() builtin accepts any iterator, not only objects that
            # expose the Python 2 ``.next()`` method.
            tok = next(tokiter)
            toktype = tok[0]
        except StopIteration:
            raise TreeFormatException("incomplete tree")

        if toktype == "or":
            _, nodeid = tok
            deds = list(forest_from_text_helper(tokiter, memo,
                                                delete_words=delete_words))
            node = Item(dummylabel, dummyi, dummyj, deds=deds)
            if nodeid:
                memo[nodeid] = node
                node.nodeid = nodeid
            yield node

        elif toktype == "nonterm":
            _, nodeid, ruleid, dcoststr = tok
            if ruleid == "":
                ruleid = dummylabel
                # No rule id on this token.  Set it explicitly so the value
                # from an earlier nonterm of this loop is not reused (and so
                # the first such token does not hit an unbound local).
                xrs_ruleid = None
            else:
                # lhuang: N.B.: sym.fromtag would re-alloc it
                xrs_ruleid = int(ruleid)
                ruleid = sym.fromtag(ruleid)

            dcost = svector.Vector()
            if dcoststr:
                # lhuang: features are read from forest, not rules
                # so there is no "e^..." or "10^..."
                # Format: comma-separated "name:value" pairs.
                for fv in dcoststr.split(','):
                    f, v = fv.split(':', 1)
                    dcost[f] = float(v)

            ants = []
            rhs = []
            vi = 1
            for child in forest_from_text_helper(tokiter, memo, want_item=True,
                                                 delete_words=delete_words):
                if isinstance(child, Item):
                    # Item children become antecedents and are represented
                    # in the rule by an indexed dummy variable.
                    ants.append(child)
                    rhs.append(sym.setindex(dummylabel, vi))
                    vi += 1
                else:
                    rhs.append(child)
            r = rule.Rule(ruleid, rule.Phrase(rhs), rule.Phrase(rhs))

            node = Deduction(ants=ants, rule=r, dcost=dcost)
            node.ruleid = xrs_ruleid

            if want_item:  # need to insert OR node
                node = Item(dummylabel, dummyi, dummyj, deds=[node])
            if nodeid:
                memo[nodeid] = node
            yield node

        elif toktype == 'term':
            terminal = tok[1]
            if terminal not in delete_words:
                yield sym.fromstring(terminal)

        elif toktype == 'ref':
            yield memo[tok[1]]

        elif toktype == 'pop':
            return

        else:
            raise TreeFormatException("unknown token %s" % (tok,))
コード例 #4
0
ファイル: convert_forest.py プロジェクト: rupenp/transforest
    def dump(self, rules=None, sid=1, fsent="<foreign-sentence>", byline="", reflines=[]):
        """Return a text dump of the forest rooted at this node.

        The result is a header (``sid``, a tab, the sentence; then the number
        of reference lines and the reference lines themselves), followed by
        the ``nodememo`` entry for the top node formatted as ``"%d\\t%d\\n"``,
        followed by ``self._dump(...)`` on the Viterbi derivation.

        Parameters:
            rules: passed to ``traverse`` and ``_dump``.  When ``byline`` is
                non-empty a synthetic rule is stored under key -1, so it must
                support item assignment in that case (the default ``None``
                does not).
            sid: sentence id written into the header and the "1-best" label.
            fsent: foreign sentence as one string; split on single spaces.
            byline: if non-empty, a space-separated string; a new top node is
                built over this node with a rule that covers it.
            reflines: reference lines; never mutated here.  Each is written
                to the header unchanged and split on spaces for ``sg.refs``.

        Side effects visible here: calls ``traverse`` and ``adjust_spans`` on
        this node; writes to the module-level ``logs`` when ``byline`` is
        given; when references are present calls ``theoracle.input``,
        ``self.reweight(weights)`` and ``output(...)``.  Also reads the
        module-level ``opts``, ``weights``, ``onebestbleus`` and
        ``onebestscores``.

        Raises AssertionError when the byline length check or the final span
        check fails.

        NOTE: uses Python 2 ``print >>`` statements.
        """

        nodememo = {}   # to keep track of sizes (# of nodes, # of edges)
        # forest id, foreign sentence (TODO: refs)

        fsent = fsent.split(" ")

        # Header: "<sid>\t<sentence>\n", the reference count, then the
        # reference lines joined without any separator.
        s = "%s\t%s\n" % (sid, " ".join(fsent)) + \
            "%d\n" % len(reflines) + \
            "".join(reflines)


        # flen: length of what words_to_chars returns for the sentence;
        # fwlen: number of space-separated words.
        flen = len(words_to_chars(fsent, encode_back=True))
        fwlen = len(fsent)

        reversed_fsent = list(reversed(fsent))  ## RIGHT TO LEFT

        if byline != "":
            self.traverse(0, 0, reversed_fsent, rules, nodememo)
            ## swap back
            self.adjust_spans(flen, fwlen)

            byline = byline.split(" ")
            # After adjust_spans, this node's start offsets (self.i chars,
            # self.wi words) are taken as the size of the byline prefix.
            byline_flen = self.i
            byline_fwlen = self.wi
            byline_f = fsent[:byline_fwlen]

            print >> logs, "clen (non-byline) = %d (%d)" % (flen, self.j - self.i)
            print >> logs, "wlen (non-byline) = %d (%d)" % (fwlen, self.wj - self.wi)
            print >> logs, "BYLINE = " + " ".join(byline_f) + \
                  " ### %d chars, %d words" % (byline_flen, byline_fwlen)

            assert len(words_to_chars(byline_f)) == byline_flen, "@sentence %d, BYLINE Error" % opts.sentid ## check consistency

            ## new rule/edge
            ## TOP("by" "line" x0:TOP) -> "BY" "LINE" x0 ### id=-1

            # Textual form of the byline rule: quoted byline words on the
            # left-hand side, quoted foreign prefix words on the right.
            byline_e = " ".join('"%s"' % w for w in byline)
            lhs = "TOP(" + byline_e + " x0:%s)" % self.x  # "TOP"
            rhs = " ".join('"%s"' % w for w in byline_f) + " x0"
            # byline rule, id=-1
            rid = -1
            rules[rid] = "%s -> %s ### id=%d" % (lhs, rhs, rid)

            ## make david-style LHS
            david_lhs = []
            for w in byline:
                david_lhs.append(sym.fromstring(w))
            david_lhs.append(sym.setindex(dummylabel, 1))

            # The same phrase object contents are used for both sides of the
            # Rule; the only antecedent is the current top node.
            ded = Deduction([self], rule.Rule(rid, rule.Phrase(david_lhs), rule.Phrase(david_lhs)),\
                            svector.Vector())
            ded.lhsstr = byline_e.split() + [self] ## N.B.: dont forget "..."
            ded.ruleid = rid

            # new node on top of TOP
            # NOTE: this rebinds the local name ``self`` only; from here on
            # the method works on the new node, the caller's object is not
            # replaced.
            oldtop = self
            self = Item(self.x, 0, flen, deds=[ded])
            self.x = oldtop.x
            self.wi = 0
            self.wj = fwlen
            self.id = len(nodememo)+1
            # Entry for the new node: its id, and the old top's second field
            # plus one (presumably the edge count, per the original note).
            nodememo[id(self)] = (self.id, nodememo[id(oldtop)][1]+1) #edges



        else:
            # establish node spans
            self.traverse(0, 0, reversed_fsent, rules, nodememo)

            # swap i,j
            self.adjust_spans(flen, fwlen)


        ## lhuang: the following is from hope.py
        ## be very careful about weights interpolation
        sg = sgml.Sentence(fsent)
        sg.fwords = fsent
        sg.refs = [refline.split(" ") for refline in reflines]

        if sg.refs:

            theoracle.input(sg, verbose=False)
            # 1-best
            self.reweight(weights)

            output(self, "1-best @ %s" % sid, onebestbleus, onebestscores)


            # NOTE(review): base_oracleweights is assigned but not used in
            # the code visible here.
            base_oracleweights = theoracle.make_weights(additive=True)
            # we use the in-place operations because oracleweights might be
            # a subclass of Vector

            # The iterable is an empty list, so this loop body never runs;
            # the commented-out [opts.hope] shows what it used to iterate.
            for relative in []:#[opts.hope]:
                oracleweights = theoracle.make_weights(additive=True)
                oracleweights *= relative

                # interpolation: taking modelcost into account
                oracleweights += weights

                # compute oracle
                self.rescore(theoracle.models, oracleweights, add=True)
                # TODO: why??
                output(self, "hope%s  " % relative, hopebleus[relative], hopescores[relative])


        # right boundary should match sentence length (in chars)
        assert self.j == flen and self.wj == fwlen, \
               "@sentence %d, Boundary Mismatch at %s\t%s" % (opts.sentid, sid, fsent) + \
               "self.j=%d, flen=%d;  self.wj=%d, fwlen=%d" % (self.j, flen, self.wj, fwlen)

        # Append the top node's nodememo pair, then the body produced by
        # _dump for the Viterbi derivation.
        s += "%d\t%d\n" % nodememo[id(self)] + \
             self._dump(rules, deriv=self.viterbi_deriv())

        return s
コード例 #5
0
            sub_j2 -= j1

            # Check English holes
            # can't lie outside phrase
            if sub_j1 < 0 or sub_j2 > elen:
                return None

            # can't overlap
            for j in xrange(sub_j1,sub_j2):
                if type(ewords[j]) is tuple or ewords[j] is None:
                    return None

            # Set first eword to var, rest to None

            # We'll clean up the Nones later
            v = sym.setindex(sub_x, index)
            fwords[i] = v
            fpos[i] = (sub_i1,sub_i2)
            ewords[sub_j1] = (v, sub_j1+j1, sub_j2+j1)
            for j in xrange(sub_j1+1,sub_j2):
                ewords[j] = None
            index += 1

    # Require an aligned French word
    if opts.require_aligned_terminal and not flag:
        return None

    epos = []
    new_ewords = []
    for i in xrange(elen):
        if ewords[i] is not None:
コード例 #6
0
ファイル: forest.py プロジェクト: jungikim/sbmt
def forest_from_text_helper(tokiter, memo, want_item=False, delete_words=[]):
    """Yield the nodes of one nesting level of a tokenized forest.

    Currently this assumes that the only frontier nodes in the tree are words.

    Tokens are pulled from ``tokiter`` until a ``'pop'`` token ends the
    current level.  ``'or'`` and ``'nonterm'`` tokens open a nested level
    that is read by a recursive call on the same iterator.

    Args:
        tokiter: iterator over token tuples whose first element is the token
            type: ``('or', nodeid)``, ``('nonterm', nodeid, ruleid,
            dcoststr)``, ``('term', word)``, ``('ref', nodeid)`` or a
            ``'pop'`` token.
        memo: dict mapping node ids to already-built nodes.  It is filled in
            for every node with a non-empty id and read for ``'ref'`` tokens.
        want_item: if true, each ``'nonterm'`` Deduction is wrapped in a
            single-deduction Item before being memoized and yielded.
        delete_words: terminals to leave out of the output (never mutated
            here; it is only passed on to the recursive calls).

    Yields:
        Item objects, Deduction objects, terminal symbols and memoized
        nodes, in token order.

    Raises:
        TreeFormatException: if the tokens run out before the closing
            ``'pop'``, or a token has an unknown type.
        KeyError: if a ``'ref'`` token names an id that is not in ``memo``.
    """
    while True:
        try:
            # The next() builtin accepts any iterator, not only objects that
            # expose the Python 2 ``.next()`` method.
            tok = next(tokiter)
            toktype = tok[0]
        except StopIteration:
            raise TreeFormatException("incomplete tree")

        if toktype == "or":
            _, nodeid = tok
            deds = list(
                forest_from_text_helper(tokiter,
                                        memo,
                                        delete_words=delete_words))
            node = Item(dummylabel, dummyi, dummyj, deds=deds)
            if nodeid:
                memo[nodeid] = node
            yield node

        elif toktype == "nonterm":
            _, nodeid, ruleid, dcoststr = tok
            # An empty rule id falls back to the dummy label.
            if ruleid == "":
                ruleid = dummylabel
            else:
                ruleid = sym.fromtag(ruleid)

            dcost = svector.Vector()
            if dcoststr:
                # Format: comma-separated "name:value" pairs.
                for fv in dcoststr.split(','):
                    f, v = fv.split(':', 1)
                    dcost[f] = float(v)

            ants = []
            rhs = []
            vi = 1
            for child in forest_from_text_helper(tokiter,
                                                 memo,
                                                 want_item=True,
                                                 delete_words=delete_words):
                if isinstance(child, Item):
                    # Item children become antecedents and are represented
                    # in the rule by an indexed dummy variable.
                    ants.append(child)
                    rhs.append(sym.setindex(dummylabel, vi))
                    vi += 1
                else:
                    rhs.append(child)
            r = rule.Rule(ruleid, rule.Phrase(rhs), rule.Phrase(rhs))

            node = Deduction(ants=ants, rule=r, dcost=dcost)
            if want_item:  # need to insert OR node
                node = Item(dummylabel, dummyi, dummyj, deds=[node])
            if nodeid:
                memo[nodeid] = node
            yield node

        elif toktype == 'term':
            terminal = tok[1]
            if terminal not in delete_words:
                yield sym.fromstring(terminal)

        elif toktype == 'ref':
            yield memo[tok[1]]

        elif toktype == 'pop':
            return

        else:
            raise TreeFormatException("unknown token %s" % (tok, ))