Beispiel #1
0
def add_constituent_prefixes(a, ephrase_index):
    """if a phrase is a prefix of a constituent, give it a fake label"""
    # Debug dump of all labeled spans before modification.
    if log.level >= 3:
        log.write(
            str([(i, j, sym.tostring(x))
                 for ((i, j), l) in a.espans.iteritems() for x in l]))
        log.write("\n")

    # Index constituents by start position: ei -> [(ej, label), ...],
    # sorted ascending by end position (stable sort keeps the reversed
    # per-span label order for equal ej).
    ei_index = {}
    for ((ei, ej), labels) in a.espans.iteritems():
        ei_index.setdefault(ei, []).extend([(ej, x) for x in reversed(labels)])
    for ei in ei_index.iterkeys():
        ei_index[ei].sort()  # stable

    for (ei, ej) in ephrase_index:
        # NOTE(review): `True or` short-circuits the has_key test, so the
        # prefix label is added even when the span already has labels —
        # looks like a deliberate override of the old condition; confirm.
        if True or not (a.espans.has_key(
            (ei, ej)) and len(a.espans[ei, ej]) > 0):
            # Take the first (shortest) constituent starting at ei that
            # extends strictly past ej and tag the phrase as its prefix.
            for (ej1, x) in ei_index.get(ei, []):
                if ej1 > ej:
                    x1 = sym.fromtag(sym.totag(x) + "*")  # fake label "X*"
                    a.espans.setdefault((ei, ej), []).append(x1)
                    prefix_labels.add(x1)  # module-level set of fake labels
                    break

    # Debug dump of the spans after adding the fake prefix labels.
    if log.level >= 3:
        log.write(
            str([(i, j, sym.tostring(x))
                 for ((i, j), l) in a.espans.iteritems() for x in l]))
        log.write("\n---\n")
Beispiel #2
0
 def feature(fphrase, ephrase, paircount, fcount, fsample_count):
     """Lexical-weighting style feature: for each terminal source word, take
     the best translation-table score against any terminal target word (or
     NULL) and sum the negated log10 scores; a zero score costs MAXSCORE."""
     source_terms = [sym.tostring(w) for w in fphrase if not sym.isvar(w)]
     target_terms = [sym.tostring(w) for w in ephrase if not sym.isvar(w)]
     target_terms.append('NULL')  # allow unaligned source words
     total = 0
     for f in source_terms:
         best = max(ttable.get_score(f, e, 1) for e in target_terms)
         if best > 0:
             total += -math.log10(best)
         else:
             total += MAXSCORE  # penalty for an unseen pair
     return total
Beispiel #3
0
 def write(self, file):
     '''Write in GIZA++ format'''
     # Three lines: comment, target sentence, then the alignment line of
     # the form "NULL ({ j... }) fword ({ j... }) ..." with 1-based target
     # indices.
     file.write("%s\n" % self.comment)
     file.write("%s\n" % " ".join([sym.tostring(word) for word in self.ewords]))
     tokens = ['NULL', '({']
     for j in xrange(len(self.ewords)):
         if not self.ealigned[j]:  # unaligned target words go under NULL
             tokens.append(str(j + 1))
     tokens.append('})')
     for i in xrange(len(self.fwords)):
         tokens.append(sym.tostring(self.fwords[i]))
         tokens.append('({')
         for j in xrange(len(self.aligned[i])):
             if self.aligned[i][j]:
                 tokens.append(str(j + 1))
         tokens.append('})')
     file.write("%s\n" % " ".join(tokens))
Beispiel #4
0
def forest_to_json(root, fwords=None, mode=None, models=None, weights=None):
    """Serialize the forest rooted at *root* into a JSON string with
    optional "source", plus "nodes", "root", and "edges" sections."""
    result = []
    result.append('{\n')

    # Optional source sentence; int entries are symbol ids and get decoded.
    if fwords:
        fwords = [(sym.tostring(fword) if type(fword) is int else fword)
                  for fword in fwords]
        result.append('  "source": [%s],\n' %
                      ",".join(quotejson(fword) for fword in fwords))

    # Number every item and remember which index is the root.
    items = list(root)
    nodeindex = {}
    nodestrs = []
    for ni, item in enumerate(items):
        nodeindex[item] = ni
        if item is root:
            ri = ni
        if item.x is None:
            nodestrs.append('    {}')
        else:
            nodestrs.append('    {"label": %s}' % quotejson(sym.totag(item.x)))
    result.append('  "nodes": [\n%s\n  ],\n' % ",\n".join(nodestrs))

    result.append('  "root": %d,\n' % ri)

    # One edge per deduction; tails are node indices or quoted terminals.
    edgestrs = []
    for ni, item in enumerate(items):
        for ded in item.deds:
            tailstrs = []

            # mode selects which side of the rule supplies the children.
            if mode == 'french':
                children = ded.rule.f if ded.rule else ded.ants
            elif mode == 'english':
                children = ded.rule.e if ded.rule else ded.ants
            else:
                children = ded.ants

            for child in children:
                if isinstance(child, Item):
                    tailstrs.append(str(nodeindex[child]))
                elif sym.isvar(child):
                    # Variable: resolve to the corresponding antecedent item.
                    ant = ded.ants[sym.getindex(child) - 1]
                    tailstrs.append(str(nodeindex[ant]))
                else:
                    tailstrs.append(quotejson(sym.tostring(child)))

            dcoststr = "{%s}" % ",".join("%s:%s" % (quotejson(f), v)
                                         for (f, v) in ded.dcost.iteritems())
            edgestrs.append(
                '    {"head": %s, "tails": [%s], "features": %s}\n' %
                (ni, ",".join(tailstrs), dcoststr))

    result.append('  "edges": [\n%s\n  ]\n' % ",\n".join(edgestrs))

    result.append('}')
    return "".join(result)
def add_bounded_prefixes(a, ephrases, etree):
    """Delegate to add_bounded_prefixes_helper; at log level >= 3, dump all
    labeled spans in a.espans before and after so the change is visible."""
    if log.level >= 3:
        spans = [(i, j, sym.tostring(x))
                 for ((i, j), l) in a.espans.iteritems() for x in l]
        log.write(str(spans))
        log.write("\n")

    add_bounded_prefixes_helper(a, ephrases, etree, 0, [])

    if log.level >= 3:
        spans = [(i, j, sym.tostring(x))
                 for ((i, j), l) in a.espans.iteritems() for x in l]
        log.write(str(spans))
        log.write("\n---\n")
Beispiel #6
0
def forest_to_json(root, fwords=None, mode=None, models=None, weights=None):
    """Serialize the forest rooted at *root* into a JSON string with
    optional "source", plus "nodes", "root", and "edges" sections."""
    result = []
    result.append('{\n')

    # Optional source sentence; int entries are symbol ids and get decoded.
    if fwords:
        fwords = [(sym.tostring(fword) if type(fword) is int else fword) for fword in fwords]
        result.append('  "source": [%s],\n' % ",".join(quotejson(fword) for fword in fwords))

    # Number every item and remember which index is the root.
    items = list(root)
    nodeindex = {}
    nodestrs = []
    for ni,item in enumerate(items):
        nodeindex[item] = ni
        if item is root:
            ri = ni
        if item.x is None:
            nodestrs.append('    {}')
        else:
            nodestrs.append('    {"label": %s}' % quotejson(sym.totag(item.x)))
    result.append('  "nodes": [\n%s\n  ],\n' % ",\n".join(nodestrs))

    result.append('  "root": %d,\n' % ri)

    # One edge per deduction; tails are node indices or quoted terminals.
    edgestrs = []
    for ni,item in enumerate(items):
        for ded in item.deds:
            tailstrs = []

            # mode selects which side of the rule supplies the children.
            if mode == 'french':
                children = ded.rule.f if ded.rule else ded.ants
            elif mode == 'english':
                children = ded.rule.e if ded.rule else ded.ants
            else:
                children = ded.ants

            for child in children:
                if isinstance(child, Item):
                    tailstrs.append(str(nodeindex[child]))
                elif sym.isvar(child):
                    # Variable: resolve to the corresponding antecedent item.
                    ant = ded.ants[sym.getindex(child)-1]
                    tailstrs.append(str(nodeindex[ant]))
                else:
                    tailstrs.append(quotejson(sym.tostring(child)))

            dcoststr = "{%s}" % ",".join("%s:%s" % (quotejson(f),v) for (f,v) in ded.dcost.iteritems())
            edgestrs.append('    {"head": %s, "tails": [%s], "features": %s}\n' % (
                    ni,
                    ",".join(tailstrs),
                    dcoststr))

    result.append('  "edges": [\n%s\n  ]\n' % ",\n".join(edgestrs))

    result.append('}')
    return "".join(result)
Beispiel #7
0
def add_bounded_prefixes(a, ephrases, etree):
    """Delegate to add_bounded_prefixes_helper; at log level >= 3, dump all
    labeled spans in a.espans before and after so the change is visible."""
    if log.level >= 3:
        log.write(
            str([(i, j, sym.tostring(x))
                 for ((i, j), l) in a.espans.iteritems() for x in l]))
        log.write("\n")

    add_bounded_prefixes_helper(a, ephrases, etree, 0, [])

    if log.level >= 3:
        log.write(
            str([(i, j, sym.tostring(x))
                 for ((i, j), l) in a.espans.iteritems() for x in l]))
        log.write("\n---\n")
Beispiel #8
0
def _ded_to_xml(node, result, memo, mode, models, weights):
    """Append the XML rendering of one deduction (an <and> node with its
    <features> and children) to the *result* string list."""
    # With weights, label the node by its rule id and include the dot-product
    # cost; otherwise label by the deduction object's own id.
    if weights:
        result.append('<and label=%s cost=%s>' % (xml.sax.saxutils.quoteattr(str(id(node.rule))),
                                                  xml.sax.saxutils.quoteattr(str(weights.dot(node.dcost)))))
    else:
        result.append('<and label=%s>' % (xml.sax.saxutils.quoteattr(str(id(node)))))

    # Emit every feature of the deduction's delta-cost vector.
    result.append('<features>')
    for f,v in node.dcost.iteritems():
        result.append('<feature name=%s value=%s/>' % (xml.sax.saxutils.quoteattr(f), xml.sax.saxutils.quoteattr(str(v))))
    result.append('</features>')

    # mode selects which side of the rule supplies the children.
    if mode == 'french':
        children = node.rule.f if node.rule else node.ants
    elif mode == 'english':
        children = node.rule.e if node.rule else node.ants
    else:
        children = node.ants

    for child in children:
        if isinstance(child, Item):
            _item_to_xml(child, result, memo, mode=mode, models=models, weights=weights)
        elif sym.isvar(child):
            # Variable: recurse into the corresponding antecedent item.
            _item_to_xml(node.ants[sym.getindex(child)-1], result, memo, mode=mode, models=models, weights=weights)
        else:
            # Terminal symbol becomes a leaf.
            result.append('<leaf label=%s/>' % xml.sax.saxutils.quoteattr(sym.tostring(child)))
    result.append('</and>')
Beispiel #9
0
def _ded_to_text(node, result, memo, mode=None, weights=None):
    """Append a parenthesized text rendering of one deduction to *result*."""
    # Convert rule and features into single tokens
    #vstr = ",".join("%s:%s" % (quotefeature(f),node.dcost[f]) for f in node.dcost)
    vstr = "cost:%s" % weights.dot(node.dcost)
    #rstr = id(node.rule)
    rstr = id(node)
    s = "%s<%s>" % (rstr,vstr)
    # The no-children short form is disabled (`False and ...`) because every
    # node here is tagged with an id, which the short form does not allow.
    if False and len(node.ants) == 0: # the format allows this but only if we don't tag with an id. but we tag everything with an id
        result.append(s)
    else:
        result.append('(')
        result.append(s)
        # mode selects which side of the rule supplies the children.
        if mode == 'french':
            children = node.rule.f if node.rule else node.ants
        elif mode == 'english':
            children = node.rule.e if node.rule else node.ants
        else:
            children = node.ants

        for child in children:
            if isinstance(child, Item):
                result.append(' ')
                _item_to_text(child, result, memo, mode=mode, weights=weights)
            elif sym.isvar(child):
                # Variable: recurse into the corresponding antecedent item.
                result.append(' ')
                _item_to_text(node.ants[sym.getindex(child)-1], result, memo, mode=mode, weights=weights)
            else:
                # Terminal symbol, quoted.
                result.append(' ')
                result.append(quoteattr(sym.tostring(child)))
        result.append(')')
Beispiel #10
0
def get_hyps(sent, goal, weights):
    """Assumes that oraclemodel.input() has been called"""
    # "Fear" direction: scale the oracle features by fear_weight and add
    # the model weights, so decoding finds the worst violators.
    fearweights = theoracle.make_weights(additive=True)
    # In-place operations because fearweights may be a Vector subclass.
    fearweights *= fear_weight
    fearweights += weights
    goal.reweight(fearweights)

    out = []
    for vec, words in decoder.get_nbest(goal, 1, 1):
        score = get_score(vec, words)
        log.write(
            "added new hyp: %s %s cost=%s score=%s\n"
            % (" ".join(sym.tostring(e) for e in words), vec, weights.dot(vec), score)
        )
        # The learner MUST not see the oracle features.
        out.append((theoracle.clean(vec), words, score))
    return out
Beispiel #11
0
def add_sister_prefixes(a, ephrases, etree):
    """Delegate to add_sister_prefixes_helper; at log level >= 3, dump the
    phrases and the constituent spans before and after the change."""
    def dump_constituents():
        # One line per labeled span in a.espans.
        for (span, labels) in a.espans.iteritems():
            log.write("%s %s\n" % (span, [sym.tostring(x) for x in labels]))

    if log.level >= 3:
        log.write("phrases before filtering:\n")
        for span in ephrases:
            log.write("%s" % (span,))
        log.write("constituents before adding:\n")
        dump_constituents()

    add_sister_prefixes_helper(a, ephrases, etree, 0)

    if log.level >= 3:
        log.write("constituents after adding:\n")
        dump_constituents()
        log.write("\n---\n")
Beispiel #12
0
def get_hyps(sent, goal, weights):
    """Assumes that oraclemodel.input() has been called"""
    # worst violators

    # "Fear" direction: oracle features scaled by fear_weight, added onto
    # the model weights before re-decoding.
    oracleweights = theoracle.make_weights(additive=True)
    # we use the in-place operations because oracleweights might be
    # a subclass of Vector
    oracleweights *= fear_weight
    oracleweights += weights

    goal.reweight(oracleweights)

    hyps = decoder.get_nbest(goal, 1, 1)
    result = []

    for hypv, hyp in hyps:
        hypscore = get_score(hypv, hyp)
        log.write("added new hyp: %s %s cost=%s score=%s\n" %
                  (" ".join(sym.tostring(e)
                            for e in hyp), hypv, weights.dot(hypv), hypscore))

        # the learner MUST not see the oracle features
        hypv = theoracle.clean(hypv)

        result.append((hypv, hyp, hypscore))

    return result
Beispiel #13
0
def add_sister_prefixes(a, ephrases, etree):
    """Delegate to add_sister_prefixes_helper; at log level >= 3, dump the
    phrases and the constituent spans before and after the change."""
    if log.level >= 3:
        log.write("phrases before filtering:\n")
        for (i, j) in ephrases:
            log.write("%s" % ((i, j), ))
        log.write("constituents before adding:\n")
        for ((i, j), l) in a.espans.iteritems():
            log.write("%s %s\n" % ((i, j), [sym.tostring(x) for x in l]))

    add_sister_prefixes_helper(a, ephrases, etree, 0)

    if log.level >= 3:
        log.write("constituents after adding:\n")
        for ((i, j), l) in a.espans.iteritems():
            log.write("%s %s\n" % ((i, j), [sym.tostring(x) for x in l]))
        log.write("\n---\n")
Beispiel #14
0
 def _fake_tree_helper(lhs, rhs, antvalues):
     """Build a tree.Node labeled with *lhs*'s tag; each rhs variable is
     replaced by its antecedent value, each terminal becomes a leaf node."""
     children = [antvalues[sym.getindex(x) - 1] if sym.isvar(x)
                 else tree.Node(sym.tostring(x), [])
                 for x in rhs]
     return tree.Node(sym.totag(lhs), children)
Beispiel #15
0
 def _fake_tree_helper(lhs, rhs, antvalues):
     """Build a tree.Node labeled with *lhs*'s tag; each rhs variable is
     replaced by its antecedent value, each terminal becomes a leaf node."""
     children = []
     for x in rhs:
         if sym.isvar(x):
             # Variable: substitute the corresponding antecedent value.
             children.append(antvalues[sym.getindex(x) - 1])
         else:
             # Terminal: a leaf with no children.
             children.append(tree.Node(sym.tostring(x), []))
     return tree.Node(sym.totag(lhs), children)
Beispiel #16
0
 def __str__(self):
     """Render the confusion network: a header line with the column count,
     then one line per column listing its alternatives as "(word, a1, a2)"
     tuples (the first field is decoded via sym.tostring)."""
     # enumerate replaces the original hand-maintained counter, and the
     # final join replaces quadratic `+=` string concatenation.
     parts = ["conf net: %d\n" % (len(self.columns),)]
     for i, col in enumerate(self.columns):
         parts.append("%d -- " % i)
         for alternative in col:
             parts.append("(%s, %s, %s) " % (sym.tostring(alternative[0]),
                                             alternative[1], alternative[2]))
         parts.append("\n")
     return "".join(parts)
Beispiel #17
0
def forest_to_xml(node, fwords=None, mode=None, models=None, weights=None):
    """Serialize the forest rooted at *node* as an XML string; when *fwords*
    is given, a <source> element is emitted first (int symbols decoded)."""
    pieces = ['<forest>']
    if fwords:
        decoded = [sym.tostring(w) if type(w) is int else w for w in fwords]
        pieces.append('<source>%s</source>' % " ".join(decoded))

    _item_to_xml(node, pieces, {}, mode=mode, models=models, weights=weights)
    pieces.append('</forest>')
    return "".join(pieces)
Beispiel #18
0
def forest_to_xml(node, fwords=None, mode=None, models=None, weights=None):
    """Serialize the forest rooted at *node* as an XML string; when *fwords*
    is given, a <source> element is emitted first (int symbols decoded)."""
    result = []
    result.append('<forest>')
    if fwords:
        # int entries are symbol ids and must be decoded to strings.
        fwords = [(sym.tostring(fword) if type(fword) is int else fword)
                  for fword in fwords]
        result.append('<source>%s</source>' % " ".join(fwords))

    # Recursively render the items/deductions into result.
    _item_to_xml(node, result, {}, mode=mode, models=models, weights=weights)
    result.append('</forest>')
    return "".join(result)
Beispiel #19
0
    def process_heldout(sent):
        """Decode one held-out sentence, score the best hypothesis against
        the oracle, and return *sent* with ewords/score_comps filled in
        (or None after a decoder failure)."""
        theoracle.input(sent)

        log.write("done preparing\n")

        # Count consecutive decoder failures across calls; bail out for good
        # once the decoder has failed 100 times.
        global decoder_errors
        try:
            goal = thedecoder.translate(sent)
            thedecoder.process_output(sent, goal)
            decoder_errors = 0
            if goal is None:
                raise Exception("parse failure")
        except Exception:
            import traceback

            log.writeln(
                "decoder raised exception: %s %s" % (sent, "".join(traceback.format_exception(*sys.exc_info())))
            )
            decoder_errors += 1
            if decoder_errors >= 100:
                log.write("decoder failed too many times, passing exception through!\n")
                raise
            else:
                # Best-effort: skip this sentence and keep going.
                return

        # Add the oracle models' scores on top of the decoder weights.
        goal.rescore(theoracle.models, thedecoder.weights, add=True)

        bestv, best = decoder.get_nbest(goal, 1)[0]
        log.write("done decoding\n")

        bestscore = get_score(bestv, best)
        log.write(
            "best hyp: %s %s cost=%s score=%s\n"
            % (" ".join(sym.tostring(e) for e in best), bestv, thedecoder.weights.dot(bestv), bestscore)
        )

        # Reduce the vector to the score-relevant oracle components.
        bestv = theoracle.finish(bestv, best)
        sent.score_comps = bestv
        sent.ewords = [sym.tostring(e) for e in best]

        return sent
Beispiel #20
0
def output(node, prompt, gbleu, gscore):
    """Print the Viterbi hypothesis of *node* with its model score and
    BLEU+1, accumulating the stats into *gbleu* and *gscore* IN PLACE."""
    deriv = node.viterbi_deriv()
    hyp = " ".join([sym.tostring(e) for e in deriv.english()])
    bleu = fbleu.rescore(hyp)
    score = weights.dot(deriv.vector())

    # in place!!
    # += mutates the caller's objects; that is the point of these params.
    gbleu += fbleu.copy()
    gscore += [score]

    print >> logs,  "%s  \tscore=%.4lf\tbleu+1=%.4lf\tlenratio=%.2lf\n%s" % \
          (prompt, score, bleu, fbleu.ratio(), hyp)            
Beispiel #21
0
    def process_heldout(sent):
        """Decode one held-out sentence, score the best hypothesis against
        the oracle, and return *sent* with ewords/score_comps filled in
        (or None after a decoder failure)."""
        theoracle.input(sent)

        log.write("done preparing\n")

        # Count consecutive decoder failures across calls; bail out for good
        # once the decoder has failed 100 times.
        global decoder_errors
        try:
            goal = thedecoder.translate(sent)
            thedecoder.process_output(sent, goal)
            decoder_errors = 0
            if goal is None: raise Exception("parse failure")
        except Exception:
            import traceback
            log.writeln(
                "decoder raised exception: %s %s" %
                (sent, "".join(traceback.format_exception(*sys.exc_info()))))
            decoder_errors += 1
            if decoder_errors >= 100:
                log.write(
                    "decoder failed too many times, passing exception through!\n"
                )
                raise
            else:
                # Best-effort: skip this sentence and keep going.
                return

        # Add the oracle models' scores on top of the decoder weights.
        goal.rescore(theoracle.models, thedecoder.weights, add=True)

        bestv, best = decoder.get_nbest(goal, 1)[0]
        log.write("done decoding\n")

        bestscore = get_score(bestv, best)
        log.write("best hyp: %s %s cost=%s score=%s\n" % (" ".join(
            sym.tostring(e)
            for e in best), bestv, thedecoder.weights.dot(bestv), bestscore))

        # Reduce the vector to the score-relevant oracle components.
        bestv = theoracle.finish(bestv, best)
        sent.score_comps = bestv
        sent.ewords = [sym.tostring(e) for e in best]

        return sent
Beispiel #22
0
    def topological_sort(self):
        """Order self.nonterminals so that each nonterminal follows its
        unary children, repeatedly removing childless nodes from the
        self.unary_children graph and breaking cycles arbitrarily."""
        # now we do a topological sort on the unary immediate domination relation
        if log.level >= 3:
            log.write("Doing topological sort on nonterminals\n")
        self.nonterminals = []

        # make unary_children into graph
        # (ensure every node that appears as a child also has an entry)
        for (x,s) in self.unary_children.items():
            for y in s:
                self.unary_children.setdefault(y, set())

        if log.level >= 3:
            for (x,s) in self.unary_children.items():
                log.write("%s -> %s\n" % (sym.tostring(x), " | ".join(sym.tostring(y) for y in s)))

        # Nonterminals not in the graph at all can go first.
        for x in sym.nonterminals():
            if not self.unary_children.has_key(x):
                self.nonterminals.append(x)

        # Repeatedly peel off a node with no remaining children.
        while len(self.unary_children) > 0:
            childless = None
            for (x,s) in self.unary_children.iteritems():
                if len(s) == 0:
                    childless = x
                    break
            if childless is None:
                # Cycle: no childless node exists; pick one arbitrarily and
                # sever its child edges to make progress.
                sys.stderr.write("cycle of unary productions detected: ")
                childless = self.unary_children.keys()[0] # arbitrary
                sys.stderr.write("breaking all unary children of %s\n" % sym.tostring(childless))
            del self.unary_children[childless]
            for (x,s) in self.unary_children.iteritems():
                s.discard(childless)
            self.nonterminals.append(childless)
        if len(self.nonterminals) < 1000 and log.level >= 3:
            log.write("Nonterminals: %s\n" % " ".join("%s=%s" % (x,sym.tostring(x)) for x in self.nonterminals))

        self.make_index()

        # The graph was consumed above; drop it.
        self.unary_children = None
Beispiel #23
0
def add_constituent_prefixes(a, ephrase_index):
    """if a phrase is a prefix of a constituent, give it a fake label"""
    # Debug dump of all labeled spans before modification.
    if log.level >= 3:
        log.write(str([(i,j,sym.tostring(x)) for ((i,j),l) in a.espans.iteritems() for x in l ]))
        log.write("\n")

    # Index constituents by start position: ei -> [(ej, label), ...],
    # sorted ascending by end position.
    ei_index = {}
    for ((ei,ej),labels) in a.espans.iteritems():
        ei_index.setdefault(ei, []).extend([(ej,x) for x in reversed(labels)])
    for ei in ei_index.iterkeys():
        ei_index[ei].sort() # stable

    for (ei,ej) in ephrase_index:
        # NOTE(review): `True or` short-circuits the has_key test, so the
        # prefix label is added even when the span already has labels.
        if True or not (a.espans.has_key((ei,ej)) and len(a.espans[ei,ej]) > 0):
            # First (shortest) constituent starting at ei that extends past
            # ej supplies the fake prefix label "X*".
            for (ej1,x) in ei_index.get(ei,[]):
                if ej1 > ej:
                    x1 = sym.fromtag(sym.totag(x)+"*")
                    a.espans.setdefault((ei,ej),[]).append(x1)
                    prefix_labels.add(x1)  # module-level set of fake labels
                    break

    # Debug dump of the spans after adding the fake prefix labels.
    if log.level >= 3:
        log.write(str([(i,j,sym.tostring(x)) for ((i,j),l) in a.espans.iteritems() for x in l ]))
        log.write("\n---\n")
Beispiel #24
0
def get_nbest(goal, n_best, ambiguity_limit=None):
    """Extract up to *n_best* derivations from *goal* and wrap each one in
    a Hypothesis (words, feature vector, derivation string)."""
    if log.level >= 1:
        log.write("  Extracting derivation(s)...\n")

    ranked = forest.NBest(goal, ambiguity_limit=ambiguity_limit)
    hyps = []
    for deriv in itertools.islice(ranked, n_best):
        h = Hypothesis()
        h.words = [sym.tostring(e) for e in deriv.english()]
        h.vector = deriv.vector()
        h.deriv = str(deriv)
        hyps.append(h)

    return hyps
Beispiel #25
0
def get_nbest(goal, n_best, ambiguity_limit=None):
    """Extract up to *n_best* derivations from *goal* and wrap each one in
    a Hypothesis (words, feature vector, derivation string)."""
    if log.level >= 1:
        log.write("  Extracting derivation(s)...\n")

    result = []

    # islice stops after n_best derivations without exhausting NBest.
    nbest = forest.NBest(goal, ambiguity_limit=ambiguity_limit)
    for deriv in itertools.islice(nbest, n_best):
        hyp = Hypothesis()
        hyp.words = [sym.tostring(e) for e in deriv.english()]
        hyp.vector = deriv.vector()
        hyp.deriv = str(deriv)

        result.append(hyp)

    return result
Beispiel #26
0
def _ded_to_xml(node, result, memo, mode, models, weights):
    """Append the XML rendering of one deduction (an <and> node with its
    <features> and children) to the *result* string list."""
    # With weights, label the node by its rule id and include the dot-product
    # cost; otherwise label by the deduction object's own id.
    if weights:
        result.append(
            '<and label=%s cost=%s>' %
            (xml.sax.saxutils.quoteattr(str(id(node.rule))),
             xml.sax.saxutils.quoteattr(str(weights.dot(node.dcost)))))
    else:
        result.append('<and label=%s>' %
                      (xml.sax.saxutils.quoteattr(str(id(node)))))

    # Emit every feature of the deduction's delta-cost vector.
    result.append('<features>')
    for f, v in node.dcost.iteritems():
        result.append('<feature name=%s value=%s/>' %
                      (xml.sax.saxutils.quoteattr(f),
                       xml.sax.saxutils.quoteattr(str(v))))
    result.append('</features>')

    # mode selects which side of the rule supplies the children.
    if mode == 'french':
        children = node.rule.f if node.rule else node.ants
    elif mode == 'english':
        children = node.rule.e if node.rule else node.ants
    else:
        children = node.ants

    for child in children:
        if isinstance(child, Item):
            _item_to_xml(child,
                         result,
                         memo,
                         mode=mode,
                         models=models,
                         weights=weights)
        elif sym.isvar(child):
            # Variable: recurse into the corresponding antecedent item.
            _item_to_xml(node.ants[sym.getindex(child) - 1],
                         result,
                         memo,
                         mode=mode,
                         models=models,
                         weights=weights)
        else:
            # Terminal symbol becomes a leaf.
            result.append('<leaf label=%s/>' %
                          xml.sax.saxutils.quoteattr(sym.tostring(child)))
    result.append('</and>')
Beispiel #27
0
def _ded_to_text(node, result, memo, mode=None, weights=None):
    """Append a parenthesized text rendering of one deduction to *result*;
    this variant also prints rule ids and words to stdout for tracing."""
    # Convert rule and features into single tokens
    #vstr = ",".join("%s:%s" % (quotefeature(f),node.dcost[f]) for f in node.dcost)
    # lhuang: in case no weights
    vstr = "cost:%s" % weights.dot(node.dcost) if weights is not None \
           else "_"
    rstr = id(node.rule)
    #rstr = id(node)
    s = "ruleid=%s<value=%s>" % (rstr,vstr)
    print "\truleid=%s" % rstr,
    
    # The no-children short form is disabled (`False and ...`) because every
    # node here is tagged with an id, which the short form does not allow.
    if False and len(node.ants) == 0: # the format allows this but only if we don't tag with an id. but we tag everything with an id
        result.append(s)
    else:
        result.append('(')
        result.append(s)
        # mode selects which side of the rule supplies the children.
        if mode == 'french':
            children = node.rule.f if node.rule else node.ants
        elif mode == 'english':
            # lhuang: default mode: english side
            children = node.rule.e if node.rule else node.ants
        else:
            children = node.ants

        for child in children:
            if isinstance(child, Item):
                result.append(' it ')
                _item_to_text(child, result, memo, mode=mode, weights=weights)
            elif sym.isvar(child):
                # lhuang: variable, do recursion
                result.append(' var ')
                _item_to_text(node.ants[sym.getindex(child)-1], result, memo, mode=mode, weights=weights)
            else:
                # lhuang: english word
                result.append(' word ')
                w = quoteattr(sym.tostring(child))
                result.append(w)
                print w,
        result.append(')')

    print # end of a hyperedge
Beispiel #28
0
def get_gold(sent, goal, weights):
    """Assumes that oraclemodel.input() has been called"""
    # "Hope" direction: scale the oracle features by -hope_weight and add
    # the model weights, so decoding favors the oracle-best hypothesis.
    hopeweights = theoracle.make_weights(additive=True)
    # In-place operations because hopeweights may be a Vector subclass.
    hopeweights *= -hope_weight
    hopeweights += weights
    goal.reweight(hopeweights)

    goldv, gold = decoder.get_nbest(goal, 1, 1)[0]
    goldscore = get_score(goldv, gold)

    log.write("gold hyp: %s %s cost=%s score=%s\n"
              % (" ".join(sym.tostring(e) for e in gold),
                 goldv, weights.dot(goldv), goldscore))

    # The learner MUST not see the oracle features.
    goldv = theoracle.clean(goldv)

    return goldv, gold, goldscore
Beispiel #29
0
    def finish(self, v, words):
        """Return a copy of v that contains only the features relevant
        to computing a score. We can also perform any necessary
        corrections to v that are possible knowing the whole
        output."""

        # Actually, for BLEU we just recompute from scratch

        # postprocessing: delete non-ASCII chars and @UNKNOWN@
        words = [sym.tostring(w) for w in words]
        words = " ".join(words)
        words = "".join(c for c in words if ord(c) < 128)
        words = [sym.fromstring(word) for word in words.split()]

        # Fresh vector: only oracle.* features survive.
        v = svector.Vector()

        # Count candidate n-grams of every order up to self.order.
        cand = collections.defaultdict(int)
        for o in xrange(self.order):
            for i in xrange(len(words) - o):
                cand[tuple(words[i:i + o + 1])] += 1

        # Clipped matches against the reference n-gram counts.
        match = collections.defaultdict(int)
        for ngram in cand:
            match[len(ngram) - 1] += min(cand[ngram],
                                         self.oraclemodel.refngrams[ngram])

        for o in xrange(self.order):
            v["oracle.match%d" % o] = match[o]
            v["oracle.guess%d" % o] = max(0, len(words) - o)

        v["oracle.srclen"] = self.wordcounter.srclen
        v["oracle.candlen"] = len(words)

        # IBM variant picks the reference length closest to the candidate.
        if self.variant == "ibm":
            v["oracle.reflen"] = min(
                (abs(l - len(words)), l) for l in self.wordcounter.reflens)[1]
        else:
            v["oracle.reflen"] = self.wordcounter.reflen

        return v
Beispiel #30
0
def get_gold(sent, goal, weights):
    """Assumes that oraclemodel.input() has been called"""
    # "Hope" direction: oracle features scaled by -hope_weight, added onto
    # the model weights before re-decoding.
    oracleweights = theoracle.make_weights(additive=True)
    # we use the in-place operations because oracleweights might be
    # a subclass of Vector
    oracleweights *= -hope_weight
    oracleweights += weights

    goal.reweight(oracleweights)

    goldv, gold = decoder.get_nbest(goal, 1, 1)[0]
    goldscore = get_score(goldv, gold)

    log.write(
        "gold hyp: %s %s cost=%s score=%s\n"
        % (" ".join(sym.tostring(e) for e in gold), goldv, weights.dot(goldv), goldscore)
    )

    # the learner MUST not see the oracle features
    goldv = theoracle.clean(goldv)

    return goldv, gold, goldscore
Beispiel #31
0
    def finish(self, v, words):
        """Return a copy of v that contains only the features relevant
        to computing a score. We can also perform any necessary
        corrections to v that are possible knowing the whole
        output."""
        
        # Actually, for BLEU we just recompute from scratch

        # postprocessing: delete non-ASCII chars and @UNKNOWN@
        words = [sym.tostring(w) for w in words]
        words = " ".join(words)
        words = "".join(c for c in words if ord(c) < 128)
        words = [sym.fromstring(word) for word in words.split()]

        # Fresh vector: only oracle.* features survive.
        v = svector.Vector()

        # Count candidate n-grams of every order up to self.order.
        cand = collections.defaultdict(int)
        for o in xrange(self.order):
            for i in xrange(len(words)-o):
                cand[tuple(words[i:i+o+1])] += 1

        # Clipped matches against the reference n-gram counts.
        match = collections.defaultdict(int)
        for ngram in cand:
            match[len(ngram)-1] += min(cand[ngram], self.oraclemodel.refngrams[ngram])
        
        for o in xrange(self.order):
            v["oracle.match%d" % o] = match[o]
            v["oracle.guess%d" % o] = max(0,len(words)-o)

        v["oracle.srclen"] = self.wordcounter.srclen
        v["oracle.candlen"] = len(words)
        
        # IBM variant picks the reference length closest to the candidate.
        if self.variant == "ibm":
            v["oracle.reflen"] = min((abs(l-len(words)), l) for l in self.wordcounter.reflens)[1]
        else:
            v["oracle.reflen"] = self.wordcounter.reflen

        return v
Beispiel #32
0
def _ded_to_text(node, result, memo, mode=None, weights=None):
    """Append a parenthesized text rendering of deduction *node* onto
    *result* (a list of string fragments)."""
    # Fold the weighted cost into a single token attached to the node id.
    label = "%s<%s>" % (id(node), "cost:%s" % weights.dot(node.dcost))

    # NOTE: the format would allow a bare leaf token for nodes with no
    # antecedents, but only untagged nodes may use it, and we tag every
    # node with an id -- so we always emit the parenthesized form.
    result.append('(')
    result.append(label)

    if mode == 'french':
        children = node.rule.f if node.rule else node.ants
    elif mode == 'english':
        children = node.rule.e if node.rule else node.ants
    else:
        children = node.ants

    for child in children:
        result.append(' ')
        if isinstance(child, Item):
            _item_to_text(child, result, memo, mode=mode, weights=weights)
        elif sym.isvar(child):
            # variables index into the antecedent list (1-based)
            _item_to_text(node.ants[sym.getindex(child) - 1],
                          result, memo, mode=mode, weights=weights)
        else:
            result.append(quoteattr(sym.tostring(child)))
    result.append(')')
Beispiel #33
0
 def __str__(self):
     """Human-readable summary: LHS ::= French side (rule count)."""
     lhs_str = sym.tostring(self.lhs)
     return "%s ::= %s (%d rules)" % (lhs_str, str(self.f), len(self.rules))
Beispiel #34
0
 def strstate(self, state):
     """Render a sequence of symbols as a space-separated string."""
     pieces = [sym.tostring(s) for s in state]
     return " ".join(pieces)
Beispiel #35
0
                    % (len(a.etags), len(a.ewords)))

        # Optionally read the English parse tree and record labeled spans.
        a.espans = None
        if opts.trees:
            if ebfile is not None:
                etree = tree.str_to_tree(ebfile.readline())
                if etree is None:
                    sys.stderr.write("warning, line %d: null tree" % a.lineno)
                    a.espans = {}
                elif etree.length != len(a.ewords):
                    # Tree/sentence length mismatch: warn and fall back to
                    # an empty span table rather than aborting.
                    sys.stderr.write(
                        "warning, line %d: length mismatch between English words and trees (%d != %d)\n"
                        % (a.lineno, len(a.ewords), etree.length))
                    sys.stderr.write(
                        "  start of English sentence: %s\n" %
                        " ".join([sym.tostring(x) for x in a.ewords[:5]]))
                    a.espans = {}
                else:
                    remove_req(etree)
                    a.espans = etree.spans()
                    # Intern the string labels as tag symbols.
                    for (span, labels) in a.espans.iteritems():
                        a.espans[span] = [sym.fromtag(x) for x in labels]

        # done reading all input lines
        if opts.discard_long_sentences and len(a.fwords) > opts.maxabslen:
            continue

        realcount += 1
        if opts.parallel is not None:
            if realcount % opts.parallel[1] != opts.parallel[
                    0] % opts.parallel[1]:
                # NOTE(review): this branch looks garbled -- the same
                # round-robin filter elsewhere in this file has a plain
                # `continue` here, while this body updates the count tables
                # using `i`, which is not defined in the visible code.
                # Confirm against the original source.
                fcount[a.fwords[i]] = fcount.get(a.fwords[i],0)+1
                ecount[null] = ecount.get(null,0)+1
                fecount[(a.fwords[i],null)] = fecount.get((a.fwords[i],null),0)+1
        # Count NULL-alignment links on the English side.
        for j in xrange(len(a.ewords)):
            if not a.ealigned[j]:
                count += 1
                fcount[null] = fcount.get(null,0)+1
                ecount[a.ewords[j]] = ecount.get(a.ewords[j],0)+1
                fecount[(null,a.ewords[j])] = fecount.get((null,a.ewords[j]),0)+1

        progress += 1
        if progress % 10000 == 0:
            # coarse progress indicator, one dot per 10k sentences
            sys.stderr.write(".")

    # Dump lexical weights
    for (fword,eword) in fecount.keys():
        if opts.ratiofile:
            # f|e
            # NOTE(review): p, p1, p2 are computed but never used below.
            c12 = fecount[fword,eword]
            c1 = ecount[eword]
            c2 = fcount[fword]
            p = float(c2)/count
            p1 = float(c12)/c1
            p2 = float(c2-c12)/(count-c1)
            # -2 * log-likelihood-ratio association score, written "e f score"
            ratiofile.write("%s %s %f\n" % (sym.tostring(eword), sym.tostring(fword), -2*llr(count,ecount[eword],fcount[fword],fecount[fword,eword])))
        if opts.weightfiles:
            # relative-frequency lexical weights in both directions:
            # f given e, and e given f
            fweightfile.write("%s %s %f\n" % (sym.tostring(fword), sym.tostring(eword), float(fecount[(fword,eword)])/ecount[eword]))
            eweightfile.write("%s %s %f\n" % (sym.tostring(eword), sym.tostring(fword), float(fecount[(fword,eword)])/fcount[fword]))

    sys.stderr.write("\n")
Beispiel #37
0
 def __str__(self):
     """Bracketed chart-item display; the root item prints as [Goal]."""
     if self.x is None:
         return "[Goal]"
     return "[%s,%d,%d,%s,cost=%s]" % (
         sym.tostring(self.x), self.i, self.j, str(self.states), self.viterbi)
Beispiel #38
0
    def traverse(self, right_idx=0, right_widx=0, fsent=None, rules=None, nodememo=None):
        '''Helper called by dump(). Prunes each node to the cheapest
        max_edges_per_node edges and computes this node's char span
        (self.i, self.j) and word span (self.wi, self.wj) from the
        right boundaries passed in. Records (node id, accumulated edge
        count) in nodememo keyed by id(self). Mutates the forest in
        place and returns None (not a string, despite the original
        docstring).'''

        if nodememo is None:
            nodememo = {}

        # already visited: spans and counts were computed on a prior call
        if id(self) in nodememo:
            return

        # rank edges by weighted cost; `weights` is a module-level vector here
        deds = [(ded.dcost.dot(weights), ded) for ded in self.deds]
        deds.sort()

        # keep only the cheapest max_edges_per_node edges
        deds = [x for _, x in deds[:max_edges_per_node]]
        self.deds = deds # prune!

        nedges = len(deds)  # accumulating number of edges, recursively

        # left boundaries of this node's span (char index and word index)
        self.i = right_idx
        self.wi = right_widx

        for dedid, ded in enumerate(deds):
            try:
                rule = rules[ded.ruleid]
            except:
                print >> sys.stderr, "WARNING: rule %d not found" % ded.ruleid
                ## assuming it's a one-word UNKNOWN rule
                ## TODO: check with lattice
                unkword = fsent[self.wi]
                rule = 'UNKNOWN("@UNKNOWN@") -> "%s"' % unkword  # in reverse order
                rules[ded.ruleid] = rule
                print >> sys.stderr, "         covering " + unkword


            self.x = rule.split("(", 1)[0]  # non-terminal label

            # analyse RHS (chinese side)
            lhs, rhs = rule.split(" -> ", 1) ## -> might be a word

            # deal with lhs; convert to ded.lhsstr = ["...", "...", Item(...), "..."]
            varid = 0
            lhsstr = []
            for child in ded.rule.e:
                if sym.isvar(child):
                    # variables are replaced by their antecedent items, in order
                    lhsstr.append(ded.ants[varid])
                    varid += 1
                else:
                    lhsstr.append(quoteattr(sym.tostring(child)))

            # will be used in _dump()
            ded.lhsstr = lhsstr

            # scan the RHS measuring the terminal "gaps" between variables,
            # in chars and in words
            vars = []
            chars_in_gap = 0
            words_in_gap = 0
            for it in reversed(rhs.split()):  ## from RIGHT to LEFT!! N.B. can't split(" ")
                if it[0] == "x":
                    #variable:
                    var = int(it[1:])
                    vars.append((var, chars_in_gap, words_in_gap))
                    chars_in_gap = 0
                    words_in_gap = 0
                else:
                    # strip off quotes "..."
                    it = it[1:-1]
                    # calculate char-length
                    if it == foreign_sentence_tag: # <foreign-sentence>:
                        # glue symbol is not counted!
                        chars_in_gap += 0
                        words_in_gap += 0
                    else:
                        # 1 for word, len(...) for char
                        chars_in_gap += len(words_to_chars(it, encode_back=True))
                        words_in_gap += 1

            # recurse into antecedents, threading the running boundaries
            accumu = self.i  ## left boundary
            waccumu = self.wi
            for i, c_gap, w_gap in vars:
            ##for sub in ded.ants:
                sub = ded.ants[i]
                if id(sub) not in nodememo:
                    sub.traverse(accumu + c_gap, waccumu + w_gap, fsent, rules, nodememo)
                    # accumulating # of edges (if first seen)
                    nedges += nodememo[id(sub)][1]

                ## don't accumulate subs now; will do in another visit
##                s += subs
                accumu = sub.j
                waccumu = sub.wj

            # right boundary = end of the last variable's span plus the
            # trailing terminal gap; every edge must agree on this node's span
            tmp_j = (ded.ants[vars[-1][0]].j if vars != [] else self.i) + chars_in_gap
            if self.j is not None and self.j != tmp_j:
                assert False, "@sentence %d, node #%s, %d %d != %d %s rule %d" % \
                       (opts.sentid, self.nodeid, self.i, self.j, tmp_j, self.x, ded.ruleid)
            self.j = tmp_j

            tmp_wj = (ded.ants[vars[-1][0]].wj if vars != [] else self.wi) + words_in_gap ##
            self.wj = tmp_wj

        # assign a 1-based id in visit order and memoize the edge count
        self.id = len(nodememo) + 1
        nodememo[id(self)] = (self.id, nedges)
Beispiel #39
0
 def __reduce__(self):
     """Pickle support: recreate the item as Item(label, i, j, deds).

     Bug fix: the span bounds were written as the bare names ``i`` and
     ``j``, which are undefined in this scope and raise NameError at
     pickling time; they are the instance attributes ``self.i`` and
     ``self.j`` (exactly as ``__str__`` reads them)."""
     return (Item, (sym.tostring(self.x), self.i, self.j, self.deds))
Beispiel #40
0
 def __str__(self):
     """Display the item as [label,i,j,states,cost=...]; root is [Goal]."""
     if self.x is None:
         return "[Goal]"
     fields = (sym.tostring(self.x), self.i, self.j, str(self.states), self.viterbi)
     return "[%s,%d,%d,%s,cost=%s]" % fields
Beispiel #41
0
 def __reduce__(self):
     """Pickle support: recreate the item as Item(label, i, j, deds).

     Bug fix: the span bounds were written as the bare names ``i`` and
     ``j``, which are undefined in this scope and raise NameError at
     pickling time; they are the instance attributes ``self.i`` and
     ``self.j`` (exactly as ``__str__`` reads them)."""
     return (Item, (sym.tostring(self.x), self.i, self.j, self.deds))
Beispiel #42
0
    def process(sent):
        """Decode *sent*, then run a MIRA-style cutting-plane update of
        the decoder weights against the oracle. Returns the sentence
        annotated with its 1-best translation, or None if the decoder
        failed on this sentence."""
        global alphas

        # in online mode, each sentence starts from a fresh update set
        if online_learning:
            updates.clear()
            alphas.clear()

        theoracle.input(sent)

        log.write("done preparing\n")

        global decoder_errors
        try:
            goal = thedecoder.translate(sent)
            thedecoder.process_output(sent, goal)
            decoder_errors = 0
            if goal is None:
                raise Exception("parse failure")
        except Exception:
            import traceback

            log.writeln(
                "decoder raised exception: %s %s" % (sent, "".join(traceback.format_exception(*sys.exc_info())))
            )
            decoder_errors += 1
            if decoder_errors >= 100:
                # too many consecutive failures: assume something is
                # systematically broken and re-raise instead of skipping
                log.write("decoder failed too many times, passing exception through!\n")
                raise
            else:
                return

        # add the oracle models so hypothesis vectors carry oracle features
        goal.rescore(theoracle.models, thedecoder.weights, add=True)

        bestv, best = decoder.get_nbest(goal, 1)[0]
        log.write("done decoding\n")

        bestscore = get_score(bestv, best)
        log.write(
            "best hyp: %s %s cost=%s score=%s\n"
            % (" ".join(sym.tostring(e) for e in best), bestv, thedecoder.weights.dot(bestv), bestscore)
        )

        goldv, gold, goldscore = get_gold(sent, goal, thedecoder.weights)

        assert (
            sent.id not in updates
        )  # in batch learning, this can happen, and we would have to undo the update associated with this sentence

        # seed this sentence's update list with a zero update so alphas
        # and updates remain consistent with the current weights
        updates[sent.id] = [(svector.Vector(), 0.0)]
        alphas[sent.id] = [max_learning_rate]

        if opts.parallel:
            # drain any updates broadcast by other MPI workers
            while True:
                if mpi.world.iprobe(tag=1):
                    (sentid, vscores) = mpi.world.recv(tag=1)
                    log.write("received update for %s\n" % (sentid,))

                    if sentid in updates:  # see comment above
                        log.write("ignoring update for %s\n" % (sentid,))
                        continue  # drop this update on the floor

                    updates[sentid] = vscores
                    alphas[sentid] = [max_learning_rate] + [0.0] * (len(vscores) - 1)
                    # since the first update is zero, the alphas & updates
                    # are still consistent with weights
                else:
                    break

        def oracle(weights):
            # loss-augmented hypotheses: difference vectors and score
            # differences between the gold derivation and each hypothesis
            hyps = get_hyps(sent, goal, weights)
            return [(goldv - hypv, goldscore - hypscore) for (hypv, hyp, hypscore) in hyps]

        thedecoder.weights, alphas = cutting_plane(thedecoder.weights, updates, alphas, {sent.id: oracle})

        remove_zeros(thedecoder.weights)
        log.write("feature weights: %s\n" % (thedecoder.weights * watch_features))
        log.write("weight norm: %s\n" % (math.sqrt(thedecoder.weights.normsquared())))

        # update weight sum for averaging
        global nweights, sumweights_helper

        # sumweights_helper = \sum_{i=0}^n (i \Delta w_i)
        for sentid in updates:
            for (v, score), alpha in itertools.izip(updates[sentid], alphas[sentid]):
                apply_update(sumweights_helper, nweights * alpha * v)
        nweights += 1

        # update feature scales
        if update_feature_scales:
            global sum_updates2, n_updates, feature_scales
            for sentid in updates:
                u = svector.Vector()
                for (v, score), alpha in itertools.izip(updates[sentid], alphas[sentid]):
                    u += alpha / max_learning_rate * v
                sum_updates2 += u * u
                n_updates += 1

            # per-feature scale = inverse of the update variance, with 0.0
            # standing in as a pseudoinverse when the variance is zero
            try:
                default_feature_scale = 1.0 / compute_variance(0, n_updates)
            except ZeroDivisionError:
                default_feature_scale = 0.0  # pseudoinverse
            feature_scales = collections.defaultdict(lambda: default_feature_scale)
            for feat in sum_updates2:
                try:
                    feature_scales[feat] = 1.0 / compute_variance(sum_updates2[feat], n_updates)
                except ZeroDivisionError:
                    feature_scales[feat] = 0.0  # pseudoinverse

            log.write(
                "feature scales: %s\n"
                % (" ".join("%s=%s" % (f, feature_scales[f]) for f in watch_features if f in feature_scales))
            )

        if opts.parallel:
            # flush out filled requests
            global requests
            requests = [request for request in requests if not request.test()]

            # transmit updates to other nodes
            for node in parallel.slaves:
                if node != parallel.rank:
                    requests.append(mpi.world.isend(value=(sent.id, updates[sent.id]), dest=node, tag=1))

        # recompute the score components over the final output and record
        bestv = theoracle.finish(bestv, best)
        theoracle.update(bestv)
        sent.score_comps = bestv

        if log.level >= 1:
            gc.collect()
            log.write("done updating, memory = %s\n" % monitor.memory())

        sent.ewords = [sym.tostring(e) for e in best]

        return sent
Beispiel #43
0
    def process(sent):
        """Translate *sent* and emit the requested outputs (forest dump,
        rule posteriors, n-best list, source/target parse trees).
        Returns the sentence with its 1-best English words filled in,
        or None on parse failure."""
        goal = thedecoder.translate(sent)

        thedecoder.process_output(sent, goal)

        if goal is None:
            return None

        if opts.forest_dir:
            # serialize the whole translation forest as gzipped JSON
            forest_file = gzip.open(os.path.join(opts.forest_dir, "forest.%s.gz" % sent.id), "w")
            forest_file.write(forest.forest_to_json(goal, fwords=sent.fwords, mode='english', models=thedecoder.models, weights=thedecoder.weights))
            forest_file.close()

        if opts.rule_posterior_dir:
            # edge posteriors via inside-outside, with inverse temperature beta
            rule_posterior_file = open(os.path.join(opts.rule_posterior_dir, "rule_posterior.%s" % sent.id), "w")
            beta = 1.
            insides = goal.compute_inside(thedecoder.weights, beta=beta)
            outsides = goal.compute_outside(thedecoder.weights, insides, beta=beta)
            z = insides[id(goal)]  # normalizer: inside score of the root
            for item in goal.bottomup():
                for ded in item.deds:
                    # posterior cost of an edge:
                    #   outside(head) + edge cost + sum inside(antecedents) - Z
                    c = outsides[id(item)]
                    c += thedecoder.weights.dot(ded.dcost)
                    c += sum(insides[id(ant)] for ant in ded.ants)
                    c -= z
                    rule_posterior_file.write("%s ||| span=%s posterior=%s\n" % (ded.rule, (item.i, item.j), cost.prob(c)))
                    # stash the posterior on the edge for the reweight below
                    ded.dcost['posterior'] = c
            rule_posterior_file.close()
            # also emit the derivation that is Viterbi-best under posteriors
            max_posterior_file = open(os.path.join(opts.rule_posterior_dir, "max_posterior.%s" % sent.id), "w")
            goal.reweight(svector.Vector('posterior=1'))
            max_posterior = goal.viterbi_deriv()

            def show(ded, antvalues):
                # decorate each subderivation with its edge posterior
                if ded.rule:
                    value = ded.rule.e.subst((), antvalues)
                else:
                    value = antvalues[0]
                return ("[%.3f" % cost.prob(ded.dcost['posterior']),) + value + ("]",)
            value = max_posterior.value(show)
            s = " ".join((sym.tostring(e) if type(e) is int else e) for e in value)
            max_posterior_file.write("%s\n" % s)

            max_posterior_file.close()

        outputs = get_nbest(goal, n_best, ambiguity_limit)

        if n_best_file:
            for (v,e) in outputs:
                e = " ".join(sym.tostring(w) for w in e)
                #n_best_file.write("%s ||| %s ||| %s\n" % (sent.id, e, -thedecoder.weights.dot(v)))
                n_best_file.write("%s ||| %s ||| %s\n" % (sent.id, e, v))
            n_best_file.flush()

        (bestv,best) = outputs[0]

        if french_parse_file:
            french_parse_file.write("%s ||| %s\n" % (sent.id, goal.viterbi_deriv().french_tree()))
            french_parse_file.flush()
        if english_parse_file:
            english_parse_file.write("%s ||| %s\n" % (sent.id, goal.viterbi_deriv().english_tree()))
            english_parse_file.flush()

        if log.level >= 1:
            gc.collect()
            log.write("  done decoding, memory=%s\n" % monitor.memory())
            log.write("  features: %s; %s\n" % (bestv, thedecoder.weights.dot(bestv)))

        sent.ewords = [sym.tostring(e) for e in best]
        return sent
Beispiel #44
0
                fecount[(null, a.ewords[j])] = fecount.get(
                    (null, a.ewords[j]), 0) + 1

        progress += 1
        if progress % 10000 == 0:
            # coarse progress indicator, one dot per 10k sentences
            sys.stderr.write(".")

    # Dump lexical weights
    for (fword, eword) in fecount.keys():
        if opts.ratiofile:
            # f|e
            # NOTE(review): p, p1, p2 are computed but never used below.
            c12 = fecount[fword, eword]
            c1 = ecount[eword]
            c2 = fcount[fword]
            p = float(c2) / count
            p1 = float(c12) / c1
            p2 = float(c2 - c12) / (count - c1)
            # -2 * log-likelihood-ratio association score, written "e f score"
            ratiofile.write("%s %s %f\n" %
                            (sym.tostring(eword), sym.tostring(fword),
                             -2 * llr(count, ecount[eword], fcount[fword],
                                      fecount[fword, eword])))
        if opts.weightfiles:
            # relative-frequency lexical weights in both directions:
            # f given e, and e given f
            fweightfile.write("%s %s %f\n" %
                              (sym.tostring(fword), sym.tostring(eword),
                               float(fecount[(fword, eword)]) / ecount[eword]))
            eweightfile.write("%s %s %f\n" %
                              (sym.tostring(eword), sym.tostring(fword),
                               float(fecount[(fword, eword)]) / fcount[fword]))

    sys.stderr.write("\n")
Beispiel #45
0
def output(f):
    """Format the Viterbi derivation of forest *f* for logging: the
    hypothesis words, the derivation itself, and its feature vector."""
    deriv = f.viterbi_deriv()
    feats = deriv.vector()
    words = " ".join(sym.tostring(e) for e in deriv.english())
    return "hyp={{{%s}}} derivation={{{%s}}} %s" % (words, deriv, feats)
Beispiel #46
0
    def process(sent):
        """Decode *sent* and apply a MIRA-style cutting-plane weight
        update against the oracle; returns the sentence annotated with
        its 1-best translation, or None on decoder failure."""
        global alphas

        # in online mode, each sentence starts from a fresh update set
        if online_learning:
            updates.clear()
            alphas.clear()

        theoracle.input(sent)

        log.write("done preparing\n")

        global decoder_errors
        try:
            goal = thedecoder.translate(sent)
            thedecoder.process_output(sent, goal)
            decoder_errors = 0
            if goal is None: raise Exception("parse failure")
        except Exception:
            import traceback
            log.writeln(
                "decoder raised exception: %s %s" %
                (sent, "".join(traceback.format_exception(*sys.exc_info()))))
            decoder_errors += 1
            if decoder_errors >= 100:
                # too many consecutive failures: re-raise instead of skipping
                log.write(
                    "decoder failed too many times, passing exception through!\n"
                )
                raise
            else:
                return

        # add the oracle models so hypothesis vectors carry oracle features
        goal.rescore(theoracle.models, thedecoder.weights, add=True)

        bestv, best = decoder.get_nbest(goal, 1)[0]
        log.write("done decoding\n")

        bestscore = get_score(bestv, best)
        log.write("best hyp: %s %s cost=%s score=%s\n" % (" ".join(
            sym.tostring(e)
            for e in best), bestv, thedecoder.weights.dot(bestv), bestscore))

        goldv, gold, goldscore = get_gold(sent, goal, thedecoder.weights)

        assert (
            sent.id not in updates
        )  # in batch learning, this can happen, and we would have to undo the update associated with this sentence

        # seed this sentence's update list with a zero update so alphas
        # and updates remain consistent with the current weights
        updates[sent.id] = [(svector.Vector(), 0.)]
        alphas[sent.id] = [max_learning_rate]

        if opts.parallel:
            # drain any updates broadcast by other MPI workers
            while True:
                if mpi.world.iprobe(tag=1):
                    (sentid, vscores) = mpi.world.recv(tag=1)
                    log.write("received update for %s\n" % (sentid, ))

                    if sentid in updates:  # see comment above
                        log.write("ignoring update for %s\n" % (sentid, ))
                        continue  # drop this update on the floor

                    updates[sentid] = vscores
                    alphas[sentid] = [max_learning_rate
                                      ] + [0.] * (len(vscores) - 1)
                    # since the first update is zero, the alphas & updates
                    # are still consistent with weights
                else:
                    break

        def oracle(weights):
            # loss-augmented hypotheses: difference vectors and score
            # differences between the gold derivation and each hypothesis
            hyps = get_hyps(sent, goal, weights)
            return [(goldv - hypv, goldscore - hypscore)
                    for (hypv, hyp, hypscore) in hyps]

        thedecoder.weights, alphas = cutting_plane(thedecoder.weights, updates,
                                                   alphas, {sent.id: oracle})

        remove_zeros(thedecoder.weights)
        log.write("feature weights: %s\n" %
                  (thedecoder.weights * watch_features))
        log.write("weight norm: %s\n" %
                  (math.sqrt(thedecoder.weights.normsquared())))

        # update weight sum for averaging
        global nweights, sumweights_helper

        # sumweights_helper = \sum_{i=0}^n (i \Delta w_i)
        for sentid in updates:
            for (v, score), alpha in itertools.izip(updates[sentid],
                                                    alphas[sentid]):
                apply_update(sumweights_helper, nweights * alpha * v)
        nweights += 1

        # update feature scales
        if update_feature_scales:
            global sum_updates2, n_updates, feature_scales
            for sentid in updates:
                u = svector.Vector()
                for (v,
                     score), alpha in itertools.izip(updates[sentid],
                                                     alphas[sentid]):
                    u += alpha / max_learning_rate * v
                sum_updates2 += u * u
                n_updates += 1

            # per-feature scale = inverse of the update variance, with 0.
            # standing in as a pseudoinverse when the variance is zero
            try:
                default_feature_scale = 1. / compute_variance(0, n_updates)
            except ZeroDivisionError:
                default_feature_scale = 0.  # pseudoinverse
            feature_scales = collections.defaultdict(
                lambda: default_feature_scale)
            for feat in sum_updates2:
                try:
                    feature_scales[feat] = 1. / compute_variance(
                        sum_updates2[feat], n_updates)
                except ZeroDivisionError:
                    feature_scales[feat] = 0.  # pseudoinverse

            log.write(
                "feature scales: %s\n" %
                (" ".join("%s=%s" % (f, feature_scales[f])
                          for f in watch_features if f in feature_scales)))

        if opts.parallel:
            # flush out filled requests
            global requests
            requests = [request for request in requests if not request.test()]

            # transmit updates to other nodes
            for node in parallel.slaves:
                if node != parallel.rank:
                    requests.append(
                        mpi.world.isend(value=(sent.id, updates[sent.id]),
                                        dest=node,
                                        tag=1))

        # recompute the score components over the final output and record
        bestv = theoracle.finish(bestv, best)
        theoracle.update(bestv)
        sent.score_comps = bestv

        if log.level >= 1:
            gc.collect()
            log.write("done updating, memory = %s\n" % monitor.memory())

        sent.ewords = [sym.tostring(e) for e in best]

        return sent
Beispiel #47
0
            a.etags = etfile.readline().split()
            # warn (but continue) when tag and word counts disagree
            if len(a.ftags) != len(a.fwords):
                sys.stderr.write("warning: length mismatch between French words and tags (%d != %d)\n" % (len(a.ftags), len(a.fwords)))
            if len(a.etags) != len(a.ewords):
                sys.stderr.write("warning: length mismatch between English words and tags (%d != %d)\n" % (len(a.etags), len(a.ewords)))

        # optionally read the English parse tree and record labeled spans
        a.espans = None
        if opts.trees:
            if ebfile is not None:
                etree = tree.str_to_tree(ebfile.readline())
                if etree is None:
                    sys.stderr.write("warning, line %d: null tree" % a.lineno)
                    a.espans = {}
                elif etree.length != len(a.ewords):
                    # tree/sentence length mismatch: warn and fall back to
                    # an empty span table rather than aborting
                    sys.stderr.write("warning, line %d: length mismatch between English words and trees (%d != %d)\n" % (a.lineno, len(a.ewords), etree.length))
                    sys.stderr.write("  start of English sentence: %s\n" % " ".join([sym.tostring(x) for x in a.ewords[:5]]))
                    a.espans = {}
                else:
                    remove_req(etree)
                    a.espans = etree.spans()
                    # intern the string labels as tag symbols
                    for (span, labels) in a.espans.iteritems():
                        a.espans[span] = [sym.fromtag(x) for x in labels]

        # done reading all input lines
        if opts.discard_long_sentences and len(a.fwords) > opts.maxabslen:
            continue

        realcount += 1
        if opts.parallel is not None:
            # round-robin sharding: skip sentences not assigned to this worker
            if realcount % opts.parallel[1] != opts.parallel[0] % opts.parallel[1]:
                continue
0
 def strstate(self, state):
     """Space-join the string forms of the symbols in *state*."""
     return " ".join(map(sym.tostring, state))