Esempio n. 1
0
    def seed(self, input, grammars, models, weights):
        """Seed the chart with the initial items for one input sentence.

        Three things happen, in this order:

        1. Every ``Grammar`` in `grammars` is paired with a fresh
           ``DotChart`` and a root dot item is added at each position
           accepted by the grammar's ``filterspan``.
        2. Every ``NewGrammar`` in `grammars` is given `input` and each rule
           it returns for a span (i, j) is scored and added as an axiom.
        3. As a last resort for unknown French words, each word is passed
           through unchanged under every default nonterminal.

        `input` must expose ``fwords``; `models` and `weights` are stored on
        the instance and handed to ``estimate_rule``.
        """
        source_syms = [sym.fromstring(word) for word in input.fwords]
        self.models = models
        self.weights = weights

        # Seed the dotchart. This will give the extracted rules
        paired = []
        for grammar in grammars:
            if isinstance(grammar, Grammar):
                paired.append((grammar, DotChart(self, source_syms)))
        self.grammars = paired

        for grammar, chart in self.grammars:
            for pos in xrange(self.n):
                if not grammar.filterspan(pos, pos, self.n):
                    continue
                chart.add(grammar.root, pos, pos, ())
                self.dot_added += 1

        # Grammars that build their rules from the input itself.
        for grammar in grammars:
            if not isinstance(grammar, NewGrammar):
                continue
            grammar.input(input)
            for start in xrange(self.n):
                for end in xrange(start + 1, self.n + 1):
                    for (new_rule,) in grammar.get_rules(start, end):
                        estimate_rule(new_rule, models, weights)
                        self.add_axiom(start, end, new_rule)

        # Last resort for unknown French word: pass it through
        for pos, word in enumerate(source_syms):
            for lhs in self.default_nonterminals:
                passthrough = rule.Rule(
                    lhs,
                    rule.Phrase([word]),
                    rule.Phrase([word]),
                    scores=svector.Vector('unknown', 1.))
                estimate_rule(passthrough, models, weights)
                self.add_axiom(pos, pos + 1, passthrough)
Esempio n. 2
0
    def seed(self, flattice, grammars, models, weights):
        """Seed the chart with the initial items for one input lattice.

        Three things happen, in this order:

        1. Every ``Grammar`` in `grammars` is paired with a fresh
           ``DotChart`` and a root dot item is added at each lattice
           position accepted by the grammar's ``filterspan``.
        2. Every ``NewGrammar`` in `grammars` is given `flattice` and each
           rule it returns for a span (i, j) is scored and added as an axiom.
        3. As a last resort for unknown French words, the word on each
           lattice edge is passed through unchanged under every default
           nonterminal.

        NOTE(review): the position loops read ``self.flattice`` while the
        rest of the method uses the `flattice` argument; presumably these
        are the same object -- confirm against the constructor.
        """
        self.models = models
        self.weights = weights

        # Seed the dotchart. This will give the extracted rules
        paired = []
        for grammar in grammars:
            if isinstance(grammar, Grammar):
                paired.append((grammar, DotChart(self, flattice)))
        self.grammars = paired

        for grammar, chart in self.grammars:
            for node in xrange(self.flattice.n - 1):
                if not grammar.filterspan(self.flattice, node, node):
                    continue
                chart.add(grammar.root, node, node, ())
                self.dot_added += 1

        # Grammars that build their rules from the input itself.
        for grammar in grammars:
            if not isinstance(grammar, NewGrammar):
                continue
            grammar.input(flattice)
            for start in xrange(self.flattice.n - 1):
                for end in xrange(start + 1, self.flattice.n):
                    for (new_rule,) in grammar.get_rules(start, end):
                        estimate_rule(new_rule, models, weights)
                        self.add_axiom(start, end, new_rule)

        # Last resort for unknown French word: pass it through
        for edge in flattice.edges:
            for lhs in self.default_nonterminals:
                passthrough = rule.Rule(
                    lhs,
                    rule.Phrase([edge.w]),
                    rule.Phrase([edge.w]),
                    scores=svector.Vector('unknown', 1.))
                estimate_rule(passthrough, models, weights)
                self.add_axiom(edge.i, edge.j, passthrough)
Esempio n. 3
0
    def input(self, input):
        """Build translation rules from the markup attached to `input`.

        `input.fmeta` is iterated as ``(tag, attrs, i, j)`` tuples and
        `input.fwords[i:j]` supplies the French side of each rule.  Only
        entries whose attributes contain ``english`` produce rules.  The
        recognised attributes, all ``|``-separated lists, are:

        * ``english``  -- alternative English phrases (required);
        * ``cost``     -- one cost per phrase, or
        * ``prob``     -- one probability per phrase (converted with
          ``-log10``); when neither is given the phrases share a uniform
          probability;
        * ``features`` -- one feature name per phrase, or ``feature`` -- a
          single name used for all phrases (default ``'sgml'``);
        * ``label``    -- left-hand-side labels (default: the upper-cased
          tag name).

        The result is stored in ``self.rules``, a mapping from ``(i, j)`` to
        a list of 1-tuples ``(rule,)``.

        Raises:
            ValueError: if the number of costs/probabilities or of feature
                names differs from the number of English phrases.
        """
        self.rules = collections.defaultdict(list)
        for tag, attrs, i, j in input.fmeta:
            attrs = sgml.attrs_to_dict(attrs)
            if 'english' not in attrs:
                continue
            ephrases = attrs['english'].split('|')

            if 'cost' in attrs:
                costs = [float(x) for x in attrs['cost'].split('|')]
            elif 'prob' in attrs:
                costs = [-math.log10(float(x)) for x in attrs['prob'].split('|')]
            else:
                # uniform distribution over the alternatives
                uniform_cost = -math.log10(1.0 / len(ephrases))
                costs = [uniform_cost] * len(ephrases)
            if len(costs) != len(ephrases):
                sys.stderr.write("wrong number of probabilities/costs")
                raise ValueError(
                    "wrong number of probabilities/costs: got %d for %d English phrases"
                    % (len(costs), len(ephrases)))

            if 'features' in attrs:
                features = attrs['features'].split('|')
                if len(features) != len(ephrases):
                    sys.stderr.write("wrong number of feature names")
                    raise ValueError(
                        "wrong number of feature names: got %d for %d English phrases"
                        % (len(features), len(ephrases)))
            elif 'feature' in attrs:
                features = [attrs['feature']] * len(ephrases)
            else:
                features = ['sgml'] * len(ephrases)

            if 'label' in attrs:
                labels = attrs['label'].split('|')
            else:
                labels = [tag.upper()]

            # bug: if new nonterminals are introduced at this point,
            # they will not participate in the topological sort

            for (ephrase, cost, feature) in zip(ephrases, costs, features):
                # `label` (not `tag`) so the outer loop variable is not rebound
                for label in labels:
                    r = rule.Rule(sym.fromtag(label),
                                  rule.Phrase(input.fwords[i:j]),
                                  rule.Phrase([sym.fromstring(e) for e in ephrase.split()]),
                                  scores=svector.Vector('%s' % feature, cost))
                    self.rules[i, j].append((r,))
Esempio n. 4
0
    def input(self, lat):
        """Build one rule per span of the lattice `lat`.

        Each element of ``lat.spans`` supplies the rule's label (``x``), its
        French words (``f``), its English words (``e``) and optionally a
        feature vector (``v``); spans without ``v`` get ``model.zero``.
        Rules are stored in ``self.rules`` under the key ``(span.i, span.j)``
        as 1-tuples ``(rule,)``.
        """
        self.rules = collections.defaultdict(list)
        for span in lat.spans:
            key = (span.i, span.j)

            scores = svector.Vector(span.v) if hasattr(span, 'v') else model.zero

            # bug: if new nonterminals are introduced at this point,
            # they will not participate in the topological sort

            lhs = sym.fromtag(span.x)
            french = rule.Phrase([sym.fromstring(word) for word in span.f])
            english = rule.Phrase([sym.fromstring(word) for word in span.e])
            span_rule = rule.Rule(lhs, french, english, scores=scores)
            self.rules[key].append((span_rule,))

            if log.level < 2:
                continue
            log.write("added lattice rule at (%d,%d): %s\n" % (key[0], key[1], span_rule))
Esempio n. 5
0
def make_forest(fieldss):
    """Build a forest from a sequence of hypothesis records.

    Each element of `fieldss` is a mapping read with the keys ``hyp``,
    ``scores``, ``back``, ``src-phrase``, ``tgt-phrase``, ``cover-start``,
    ``cover-end`` and ``forward``.  Every distinct ``hyp`` value becomes one
    PHRASE item; hypothesis 0 gets an empty rule, every other hypothesis gets
    a deduction that extends the item named by ``back`` with the record's
    source/target phrase.  Records whose ``forward`` is negative are goals.

    Returns a new item with one deduction per goal hypothesis.
    """
    nodes = {}
    goal_ids = set()

    for record in fieldss:
        hyp_id = record['hyp']
        try:
            node = nodes[hyp_id]
        except KeyError:
            node = nodes[hyp_id] = forest.Item(sym.fromtag('PHRASE'), 0, 0, [])

        if hyp_id == 0:
            # Initial hypothesis: empty rule, no antecedents, zero cost.
            empty = rule.Rule(sym.fromtag('PHRASE'), rule.Phrase([]), rule.Phrase([]))
            node.deds.append(forest.Deduction((), empty, svector.Vector()))
            continue

        parsed = scores_re.match(record['scores'])
        core_values = [float(value) for value in parsed.group(1).split(',')]
        cost = svector.Vector(parsed.group(2).encode('utf8'))
        for index, value in enumerate(core_values):
            cost["_core%d" % index] = value

        antecedent = nodes[int(record['back'])]
        src_words = record['src-phrase'].encode('utf8').split()
        tgt_words = record['tgt-phrase'].encode('utf8').split()
        covered = int(record['cover-end']) - int(record['cover-start']) + 1
        if len(src_words) != covered:
            sys.stderr.write("warning: French phrase length didn't match covered length\n")

        # Both sides start with the variable standing for the antecedent.
        src_side = rule.Phrase([sym.setindex(sym.fromtag('PHRASE'), 1)] + src_words)
        tgt_side = rule.Phrase([sym.setindex(sym.fromtag('PHRASE'), 1)] + tgt_words)
        extension = rule.Rule(sym.fromtag('PHRASE'), src_side, tgt_side)

        node.deds.append(forest.Deduction((antecedent,), extension, cost))

        if int(record['forward']) < 0:  # goal
            goal_ids.add(hyp_id)

    goal = forest.Item(None, 0, 0, [])
    for goal_id in goal_ids:
        goal_ded = forest.Deduction((nodes[goal_id],), None, svector.Vector())
        goal.deds.append(goal_ded)
    return goal
Esempio n. 6
0
def output(f):
    """Describe the Viterbi derivation of forest `f` on one line.

    The result has the form ``hyp={{{...}}} derivation={{{...}}} <vector>``:
    the English words of the derivation joined by spaces, the derivation
    itself, and its feature vector.
    """
    best = f.viterbi_deriv()
    feature_vector = best.vector()
    english_words = [sym.tostring(word) for word in best.english()]
    return "hyp={{{%s}}} derivation={{{%s}}} %s" % (
        " ".join(english_words), best, feature_vector)

# Walk the source file, the forest file and all reference files in lockstep.
# `reflines` is a tuple holding the current line of every reference file.
for srcline, forestline, reflines in itertools.izip(srcfile, forestfile, itertools.izip(*reffiles)):
    f = forest.forest_from_text(forestline)

    # the oracle needs to know how long all the French spans are
    for item in f.bottomup():
        for ded in item.deds:
            # replace rule's French side with correct number of French words
            # we don't even bother to use the right number of variables
            ded.rule = rule.Rule(ded.rule.lhs,
                                 rule.Phrase([sym.fromstring('<foreign-word>')]*int(ded.dcost['foreign-length'])),
                                 ded.rule.e)

    # Score the forest under the current weights and report its best
    # derivation.
    f.reweight(weights)
    print "1-best %s" % output(f)

    # Hand the oracle a sentence object carrying the whitespace-tokenised
    # source words and one token list per reference.
    s = sgml.Sentence(srcline.split())
    s.fwords = srcline.split()
    s.refs = [refline.split() for refline in reflines]
    theoracle.input(s, verbose=False)

    # After the two in-place steps below, oracleweights holds
    # (weights - oracle weights).
    oracleweights = theoracle.make_weights(additive=True)
    # we use the in-place operations because oracleweights might be
    # a subclass of Vector
    oracleweights *= -1
    oracleweights += weights
Esempio n. 7
0
    for i in xrange(elen):
        if ewords[i] is not None:
            if type(ewords[i]) is tuple:
                (v, ei, ej) = ewords[i]
                # force slash categories to be at left edge of English side
                if force_english_prefix and len(
                        new_ewords) != 0 and sym.clearindex(
                            v) in prefix_labels:
                    return None
                new_ewords.append(v)
                epos.append((ei, ej))
            else:
                new_ewords.append(ewords[i])
                epos.append(i + j1)

    r = XRule(x, rule.Phrase(tuple(fwords)), rule.Phrase(tuple(new_ewords)))
    r.fpos = fpos
    r.epos = epos
    r.span = (i1, i2, j1, j2)

    if opts.keep_word_alignments:
        r.word_alignments = []
        for fi in xrange(len(fpos)):
            if type(fpos[fi]) is int:
                for ei in xrange(len(epos)):
                    if type(epos[ei]) is int:
                        if a.aligned[fpos[fi]][epos[ei]]:
                            r.word_alignments.append((fi, ei))
    return r

Esempio n. 8
0
def forest_from_text_helper(tokiter, memo, want_item=False, delete_words=()):
    """Generate the children of the current node from a token stream.

    Currently this assumes that the only frontier nodes in the tree are words.

    Tokens are tuples whose first element is the token type:

    * ``("or", nodeid)`` -- an OR node; its children (deductions) are read
      recursively and wrapped in an ``Item``.
    * ``("nonterm", nodeid, ruleid, dcoststr)`` -- a deduction; `dcoststr`
      is a comma-separated list of ``feature:value`` pairs.  Its children
      are read recursively; ``Item`` children become antecedents, anything
      else is kept as a terminal on the rule's right-hand side.
    * ``("term", word)`` -- a terminal, yielded unless it is in
      `delete_words`.
    * ``("ref", nodeid)`` -- a back-reference to a node stored in `memo`.
    * ``("pop",)`` -- end of the current node's children.

    Args:
        tokiter: iterator over tokens; consumed as nodes are read.
        memo: dict mapping node ids to already-built nodes; updated in place
            for every node that has a non-empty id.
        want_item: if true, each deduction is wrapped in its own ``Item``
            before being yielded.
        delete_words: collection of terminals to drop.  The default is an
            immutable empty tuple so no state is shared between calls.

    Yields:
        ``Item`` objects, ``Deduction`` objects, or terminals.

    Raises:
        TreeFormatException: if the token stream ends before the matching
            ``pop`` token, or a token has an unknown type.
    """
    while True:
        try:
            # builtin next() works on Python 2.6+ and Python 3 iterators
            tok = next(tokiter)
        except StopIteration:
            raise TreeFormatException("incomplete tree")
        toktype = tok[0]

        if toktype == "or":
            _, nodeid = tok
            deds = list(
                forest_from_text_helper(tokiter,
                                        memo,
                                        delete_words=delete_words))
            node = Item(dummylabel, dummyi, dummyj, deds=deds)
            if nodeid:
                memo[nodeid] = node
            yield node

        elif toktype == "nonterm":
            _, nodeid, ruleid, dcoststr = tok
            if ruleid == "":
                ruleid = dummylabel
            else:
                ruleid = rule.Nonterminal(ruleid)
            dcost = svector.Vector()
            if dcoststr:
                for fv in dcoststr.split(','):
                    f, v = fv.split(':', 1)
                    dcost[f] = float(v)

            ants = []
            rhs = []
            vi = 1  # index of the next variable on the right-hand side
            for child in forest_from_text_helper(tokiter,
                                                 memo,
                                                 want_item=True,
                                                 delete_words=delete_words):
                if isinstance(child, Item):
                    ants.append(child)
                    rhs.append(dummylabel.setindex(vi))
                    vi += 1
                else:
                    rhs.append(child)
            # The same right-hand side is used for both sides of the rule.
            r = rule.Rule(ruleid, rule.Phrase(rhs), rule.Phrase(rhs))

            node = Deduction(ants=ants, rule=r, dcost=dcost)
            if want_item:  # need to insert OR node
                node = Item(dummylabel, dummyi, dummyj, deds=[node])
            if nodeid:
                memo[nodeid] = node
            yield node

        elif toktype == 'term':
            terminal = tok[1]
            if terminal not in delete_words:
                yield terminal

        elif toktype == 'ref':
            yield memo[tok[1]]

        elif toktype == 'pop':
            return

        else:
            raise TreeFormatException("unknown token %s" % (tok, ))
Esempio n. 9
0
    def translate(self, input):
        """Decode `input` with the external decoder and return its forest.

        input: any object that has an attribute 'words' which is a list of
           numberized French words, an 'id' attribute and an 'instruction'
           attribute (as stated by the original documentation).
        output: a forest

        The decoder is restarted once ``self.decoder_age`` reaches 100.  An
        attempt fails when an exception is raised, the decoder returns an
        empty string, ``create_forest`` rejects the output, or the child
        process has exited.  After each failed attempt the decoder is
        restarted; after the fourth failure a dummy "noparse" forest is
        created instead of raising.

        Before returning, every deduction's French side is replaced by
        ``foreign-length`` placeholder words, the features listed in
        ``delete_features`` are removed, and the forest is reweighted with
        ``self.weights``.
        """

        if self.decoder_age >= 100:
            self.start_decoder()

        restarts = 0
        self.decoder_age += 1
        outforest = ""
        lastexcept = None
        while restarts <= 3:
            try:
                self.send_weights(input=input)
                outforest = self.instruct(input)
                if (outforest != "" and self.create_forest(outforest)
                        and self.child.poll() is None):
                    break
                # An unusable forest counts as a failed attempt; otherwise a
                # decoder that keeps answering with nothing would make this
                # loop spin forever without ever being restarted.
                lastexcept = "decoder produced no usable forest"
                log.writeln(lastexcept)
            except Exception:
                # Exception (not a bare except) so that KeyboardInterrupt
                # and SystemExit still propagate.
                lastexcept = log.strexcept(True)
                log.writeln("CAUGHT exception: %s" % lastexcept)
            restarts += 1
            if restarts <= 3:
                log.writeln("restarting decoder")
                self.start_decoder()
            else:
                self.start_decoder()
                #don't raise because of global 100-retries limit in trainer.py
                log.write(
                    "too many decoder restarts, giving up on exception %s:\n%s\nwith weights:\n%s\n"
                    % (lastexcept, repr(input), self.weights))
                self.create_forest("(0<noparse:1> )")

        log.writeln("received forest: %s...%s for %s" %
                    (outforest[:80], outforest[-80:], input))

        # create_forest leaves its result in self.forest; take ownership of
        # it and clear the attribute.
        f = self.forest
        self.forest = None

        for item in f.bottomup():
            for ded in item.deds:
                # replace rule's French side with correct number of French words
                # we don't even bother to use the right number of variables
                ded.rule = rule.Rule(
                    ded.rule.lhs,
                    rule.Phrase([sym.fromstring('<foreign-word>')] *
                                int(ded.dcost['foreign-length'])), ded.rule.e)

                for feature in delete_features:
                    del ded.dcost[feature]

        f.reweight(
            self.weights)  # because forest_from_text doesn't compute viterbi

        return f