Esempio n. 1
0
def add_multiconstituents(a, maxabslen, ephrase_index, consts):
    """Give unlabeled phrases a label built by concatenating (with "_")
    the smallest sequence of constituent labels that covers them, as
    long as that sequence uses at most `consts` labels."""
    n = len(a.ewords)

    # chart[i][j]: shortest known label sequence covering span (i, j).
    chart = [[None] * (n + 1) for _ in xrange(n + 1)]
    for ((start, end), labels) in a.espans.iteritems():
        chart[start][end] = [labels[0]]  # take the highest label

    # Bottom-up over span lengths: pick the split minimizing label count.
    for length in xrange(2, maxabslen + 1):
        for start in xrange(n - length + 1):
            end = start + length
            if chart[start][end] is not None:  # must be a singleton
                continue
            best_mid = None
            best_count = None
            for mid in xrange(start + 1, end):
                left = chart[start][mid]
                right = chart[mid][end]
                if left is None or right is None:
                    continue
                count = len(left) + len(right)
                if best_count is None or count < best_count:
                    best_mid = mid
                    best_count = count
            if best_count is not None and best_count <= consts:
                chart[start][end] = chart[start][best_mid] + chart[best_mid][end]

    # Attach joined labels only to phrases with no existing label.
    for (start, end) in ephrase_index:
        if not a.espans.has_key((start, end)) and chart[start][end] is not None:
            joined = "_".join(sym.totag(x) for x in chart[start][end])
            a.espans[start, end] = [sym.fromtag(joined)]
Esempio n. 2
0
def add_constituent_prefixes(a, ephrase_index):
    """if a phrase is a prefix of a constituent, give it a fake label

    For each phrase span (ei, ej), find the narrowest constituent that
    starts at ei and strictly contains the phrase (ej1 > ej), and append
    its label suffixed with "*" to a.espans[ei, ej].  Each fake label is
    also recorded in the global prefix_labels set.
    """
    if log.level >= 3:
        log.write(
            str([(i, j, sym.tostring(x))
                 for ((i, j), l) in a.espans.iteritems() for x in l]))
        log.write("\n")

    # Index constituent (end, label) pairs by start point.  Reversing the
    # labels before the stable sort keeps the highest label first among
    # entries with equal end points.
    ei_index = {}
    for ((ei, ej), labels) in a.espans.iteritems():
        ei_index.setdefault(ei, []).extend([(ej, x) for x in reversed(labels)])
    for ei in ei_index.iterkeys():
        ei_index[ei].sort()  # stable

    for (ei, ej) in ephrase_index:
        # NOTE(review): the original guard was
        # `if True or not (a.espans.has_key((ei, ej)) and ...)`, which is
        # always true; the dead condition has been removed (no behavior
        # change -- phrases receive a prefix label even when already
        # labeled).
        for (ej1, x) in ei_index.get(ei, []):
            if ej1 > ej:
                x1 = sym.fromtag(sym.totag(x) + "*")
                a.espans.setdefault((ei, ej), []).append(x1)
                prefix_labels.add(x1)
                break

    if log.level >= 3:
        log.write(
            str([(i, j, sym.tostring(x))
                 for ((i, j), l) in a.espans.iteritems() for x in l]))
        log.write("\n---\n")
Esempio n. 3
0
def add_sister_prefixes_helper(a, ephrases, enode, i):
    """if a phrase comprises one or more (but not all) leftmost children of a constituent, then add it and give it a fake label"""

    j = i + enode.length
    if log.level >= 3:
        log.write("(i,j) = %s\n" % ((i, j), ))
    x = enode.label

    # Sweep the end points of the leftmost-children prefixes of enode.
    j1 = i
    for ci, child in enumerate(enode.children):
        j1 += child.length
        if log.level >= 3:
            log.write("(i,j1) = %s\n" % ((i, j1), ))
        if j1 >= j or (i, j1) not in ephrases:
            continue
        # markov1 labeling: parent label plus the next sister's label.
        # j1 < j guarantees that enode.children[ci + 1] exists.
        x1 = sym.fromtag("%s/%s" % (x, enode.children[ci + 1].label))
        a.espans.setdefault((i, j1), []).append(x1)
        prefix_labels.add(x1)

    # Recurse into each child at its own absolute start offset.
    child_start = i
    for child in enode.children:
        add_sister_prefixes_helper(a, ephrases, child, child_start)
        child_start += child.length
Esempio n. 4
0
def add_sister_prefixes_helper(a, ephrases, enode, i):
    """if a phrase comprises one or more (but not all) leftmost children of a constituent, then add it and give it a fake label"""

    # (i, j) is the absolute span of this constituent; i is enode's start
    # offset within the whole sentence.
    j = i+enode.length
    if log.level >= 3:
        log.write("(i,j) = %s\n" % ((i,j),))
    x = enode.label
    # j1 sweeps over the end points of the leftmost-children prefixes.
    j1 = i
    for ci in xrange(len(enode.children)):
        child = enode.children[ci]
        j1 += child.length
        if log.level >= 3:
            log.write("(i,j1) = %s\n" % ((i,j1),))
        # A proper prefix (j1 < j) that is also an extracted phrase gets a
        # fake label; j1 < j guarantees children[ci+1] below exists.
        if j1 < j and (i,j1) in ephrases:

            # constprefix3:
            #x1 = sym.fromtag("%s*" % x)

            # subcat-lr2:
            #subcat = [sister.label for sister in enode.children[ci+1:] if sister.required]
            #x1 = sym.fromtag("/".join(["%s*"%x]+subcat))

            # markov1:
            x1 = sym.fromtag("%s/%s" % (x, enode.children[ci+1].label))

            # markov2:
            #x1 = sym.fromtag("%s(%s)" % (x, enode.children[ci].label))

            a.espans.setdefault((i,j1),[]).append(x1)
            prefix_labels.add(x1)

    # Recurse into children, advancing i to each child's absolute start.
    for child in enode.children:
        add_sister_prefixes_helper(a, ephrases, child, i)
        i += child.length
Esempio n. 5
0
def make_forest(fieldss):
    """Build a forest from decoder hypothesis records.

    fieldss is an iterable of dicts with keys 'hyp', 'scores', 'back',
    'src-phrase', 'tgt-phrase', 'cover-start', 'cover-end' and 'forward'.
    Each record becomes a Deduction on the Item for its 'hyp' id; record 0
    gets an empty phrase pair with no antecedents.  Records whose
    'forward' value is negative are treated as goals, and a single root
    Item joining all goals is returned.
    """
    nodes = {}  # hyp id -> forest.Item
    goal_ids = set()  # hyp ids flagged as goals (forward < 0)
    for fields in fieldss:
        node_id = fields['hyp']
        if node_id not in nodes:
            nodes[node_id] = forest.Item(sym.fromtag('PHRASE'), 0, 0, [])
        node = nodes[node_id]

        if node_id == 0:
            # Initial hypothesis: empty phrase pair, no antecedents.
            r = rule.Rule(sym.fromtag('PHRASE'), rule.Phrase([]), rule.Phrase([]))
            node.deds.append(forest.Deduction((), r, svector.Vector()))
        else:
            # 'scores' holds comma-separated core model values (group 1)
            # plus a sparse feature-vector string (group 2); both are
            # folded into the deduction cost.
            m = scores_re.match(fields['scores'])
            core_values = [float(x) for x in m.group(1).split(',')]
            dcost = svector.Vector(m.group(2).encode('utf8'))
            for i, x in enumerate(core_values):
                dcost["_core%d" % i] = x

            back = int(fields['back'])
            ant = nodes[back]
            f = fields['src-phrase'].encode('utf8').split()
            e = fields['tgt-phrase'].encode('utf8').split()
            if len(f) != int(fields['cover-end']) - int(fields['cover-start']) + 1:
                sys.stderr.write("warning: French phrase length didn't match covered length\n")

            # Prepend variable slot 1 for the back-pointed antecedent.
            f = rule.Phrase([sym.setindex(sym.fromtag('PHRASE'), 1)] + f)
            e = rule.Phrase([sym.setindex(sym.fromtag('PHRASE'), 1)] + e)
            r = rule.Rule(sym.fromtag('PHRASE'), f, e)

            ded = forest.Deduction((ant,), r, dcost)
            node.deds.append(ded)

            if int(fields['forward']) < 0: # goal
                goal_ids.add(node_id)

    # Join all goal items under a single ruleless root.
    goal = forest.Item(None, 0, 0, [])
    for node_id in goal_ids:
        goal.deds.append(forest.Deduction((nodes[node_id],), None, svector.Vector()))
    return goal
Esempio n. 6
0
 def _str_helper(self, item, accum):
     """Append a bracketed-tree rendering of item's derivation to accum.

     Looks up item's deduction in self.ded (keyed by object id), labels
     the node with the deduction's rule lhs ("-" when there is no rule),
     and recurses into antecedents, emitting "(label ant ...)" fragments.
     """
     ded = self.ded[id(item)]
     if ded.rule:
         x = ded.rule.lhs
     else:
         x = sym.fromtag("-")
     if len(ded.ants) > 0:
         accum.extend(["(", sym.totag(x)])
         for ant in ded.ants:
             accum.append(" ")
             self._str_helper(ant, accum)
         accum.append(")")
     else:
         accum.append(sym.totag(x))
Esempio n. 7
0
 def _str_helper(self, item, accum):
     """Recursively render item's deduction as a bracketed tree,
     appending string fragments to accum."""
     ded = self.ded[id(item)]
     # Label with the rule's lhs, or "-" for rule-less deductions.
     label = ded.rule.lhs if ded.rule else sym.fromtag("-")
     if ded.ants:
         accum.append("(")
         accum.append(sym.totag(label))
         for ant in ded.ants:
             accum.append(" ")
             self._str_helper(ant, accum)
         accum.append(")")
     else:
         accum.append(sym.totag(label))
Esempio n. 8
0
def add_bounded_prefixes_helper(a, phrases, node, i, stack):
    """Add "X*" fake labels for phrases that are leftmost prefixes of
    constituents; the stack of candidate constituents is carried only
    through NP nodes and reset elsewhere (hence "bounded")."""
    span = (node.label, i, i + node.length)
    # NP nodes extend the stack of open constituents; any other label
    # starts a fresh stack, bounding how far up a prefix may reach.
    stack = (stack + [span]) if node.label in ['NP'] else [span]
    child_start = i
    for child in node.children:
        # A child boundary strictly inside the constituent may close a
        # prefix of any stacked span.
        if child_start > i:
            for (label, left, _right) in stack:
                if (left, child_start) in phrases:
                    fake = sym.fromtag("%s*" % label)
                    a.espans.setdefault((left, child_start), []).append(fake)
                    prefix_labels.add(fake)
        add_bounded_prefixes_helper(a, phrases, child, child_start, stack)
        child_start += child.length
Esempio n. 9
0
def add_bounded_prefixes_helper(a, phrases, node, i, stack):
    """Add "X*" fake labels for phrases that are leftmost prefixes of
    constituents; the stack of candidate constituents is kept only
    through NP nodes and reset elsewhere (hence "bounded")."""
    j = i+node.length
    # NP nodes extend the stack of open constituents; any other label
    # starts it over, bounding how far up a prefix may reach.
    if node.label in ['NP']:
        stack = stack+[(node.label,i,j)]
    else:
        stack = [(node.label,i,j)]
    i1 = i
    for child in node.children:
        # i1 is this child's absolute start; a boundary strictly inside
        # the constituent may close a prefix of any stacked span.
        if i1 > i:
            for (x,i0,j0) in stack:
                if (i0,i1) in phrases:
                    x1 = sym.fromtag("%s*" % x)
                    a.espans.setdefault((i0,i1),[]).append(x1)
                    prefix_labels.add(x1)
        add_bounded_prefixes_helper(a, phrases, child, i1, stack)
        i1 += child.length
Esempio n. 10
0
    def input(self, input):
        """Read SGML phrase annotations from input.fmeta and build the
        rule table self.rules, keyed on (i, j) source spans.

        Each tag carrying an 'english' attribute contributes one rule per
        (english phrase x label).  Costs come from 'cost' or 'prob'
        attributes (uniform when absent); feature names from
        'features'/'feature' (default 'sgml'); labels from 'label' or the
        upper-cased tag name.

        Raises ValueError when the cost or feature lists do not match the
        number of english phrases.
        """
        self.rules = collections.defaultdict(list)
        for tag, attrs, i, j in input.fmeta:
            attrs = sgml.attrs_to_dict(attrs)
            if attrs.has_key('english'):
                ephrases = attrs['english'].split('|')

                # Per-phrase costs: explicit costs, -log10 of probs, or
                # uniform over the alternatives.
                if attrs.has_key('cost'):
                    costs = [float(x) for x in attrs['cost'].split('|')]
                elif attrs.has_key('prob'):
                    costs = [-math.log10(float(x)) for x in attrs['prob'].split('|')]
                else:
                    costs = [-math.log10(1.0/len(ephrases)) for e in ephrases] # uniform
                if len(costs) != len(ephrases):
                    sys.stderr.write("wrong number of probabilities/costs")
                    raise ValueError

                # Feature name under which each phrase's cost is scored.
                if attrs.has_key('features'):
                    features = attrs['features'].split('|')
                    if len(features) != len(ephrases):
                        sys.stderr.write("wrong number of feature names")
                        raise ValueError
                elif attrs.has_key('feature'):
                    features = [attrs['feature'] for ephrase in ephrases]
                else:
                    features = ['sgml' for ephrase in ephrases]

                # Nonterminal labels: explicit 'label' list or the tag name.
                if attrs.has_key('label'):
                    tags = attrs['label'].split('|')
                else:
                    tags = [tag.upper()]

                # bug: if new nonterminals are introduced at this point,
                # they will not participate in the topological sort

                for (ephrase,cost,feature) in zip(ephrases,costs,features):
                    for tag in tags:
                        r = rule.Rule(sym.fromtag(tag),
                                      rule.Phrase(input.fwords[i:j]),
                                      rule.Phrase([sym.fromstring(e) for e in ephrase.split()]),
                                      scores=svector.Vector('%s' % feature, cost))
                        self.rules[i,j].append((r,))
Esempio n. 11
0
    def input(self, lat):
        """Convert each span of the input lattice into a phrase rule.

        Every lattice span contributes one rule (span.x -> f / e), scored
        by the span's own feature vector when it has one, stored in
        self.rules keyed on (i, j).
        """
        self.rules = collections.defaultdict(list)
        for span in lat.spans:
            i, j = span.i, span.j

            # A span may carry its own score vector; otherwise it is free.
            v = svector.Vector(span.v) if hasattr(span, 'v') else model.zero

            # bug: if new nonterminals are introduced at this point,
            # they will not participate in the topological sort

            french = rule.Phrase([sym.fromstring(f) for f in span.f])
            english = rule.Phrase([sym.fromstring(e) for e in span.e])
            r = rule.Rule(sym.fromtag(span.x), french, english, scores=v)
            self.rules[i, j].append((r,))
            if log.level >= 2:
                log.write("added lattice rule at (%d,%d): %s\n" % (i, j, r))
Esempio n. 12
0
def add_multiconstituents(a, maxabslen, ephrase_index, consts):
    """Label non-constituent phrases with the concatenation (joined by
    "_") of the smallest sequence of constituent labels covering them,
    provided that sequence has at most `consts` labels."""
    elen = len(a.ewords)

    # chart[ei][ej]: shortest label sequence covering span (ei, ej), or None.
    chart = [[None for ej in xrange(elen+1)] for ei in xrange(elen+1)]
    for ((ei,ej),labels) in a.espans.iteritems():
        chart[ei][ej] = [labels[0]] # take the highest label

    # Bottom-up over span lengths: try every split point and keep the one
    # minimizing the total number of labels.
    for el in xrange(2,maxabslen+1):
        for ei in xrange(elen-el+1):
            ej = ei+el
            if chart[ei][ej] is not None: # must be a singleton
                continue
            bestsplit = None
            bestlen = None
            for ek in xrange(ei+1,ej):
                if chart[ei][ek] is not None and chart[ek][ej] is not None and (bestlen is None or len(chart[ei][ek])+len(chart[ek][ej]) < bestlen):
                    bestsplit = ek
                    bestlen = len(chart[ei][ek])+len(chart[ek][ej])
            if bestlen is not None and bestlen <= consts:
                chart[ei][ej] = chart[ei][bestsplit]+chart[bestsplit][ej]
    # Only label phrases that don't already have a constituent label.
    for (ei,ej) in ephrase_index:
        if not a.espans.has_key((ei,ej)) and chart[ei][ej] is not None:
            a.espans[ei,ej] = [sym.fromtag("_".join(sym.totag(x) for x in chart[ei][ej]))]
Esempio n. 13
0
def add_constituent_prefixes(a, ephrase_index):
    """if a phrase is a prefix of a constituent, give it a fake label"""
    if log.level >= 3:
        log.write(str([(i,j,sym.tostring(x)) for ((i,j),l) in a.espans.iteritems() for x in l ]))
        log.write("\n")

    # Index constituent (end, label) pairs by start point; reversing the
    # labels before the stable sort keeps the highest label first among
    # entries with equal end points.
    ei_index = {}
    for ((ei,ej),labels) in a.espans.iteritems():
        ei_index.setdefault(ei, []).extend([(ej,x) for x in reversed(labels)])
    for ei in ei_index.iterkeys():
        ei_index[ei].sort() # stable

    for (ei,ej) in ephrase_index:
        # NOTE(review): "True or ..." makes this condition always true, so
        # the espans check is dead code and phrases receive a prefix label
        # even when already labeled -- confirm whether this is intended.
        if True or not (a.espans.has_key((ei,ej)) and len(a.espans[ei,ej]) > 0):
            # Take the narrowest constituent starting at ei that strictly
            # contains the phrase (ej1 > ej).
            for (ej1,x) in ei_index.get(ei,[]):
                if ej1 > ej:
                    x1 = sym.fromtag(sym.totag(x)+"*")
                    a.espans.setdefault((ei,ej),[]).append(x1)
                    prefix_labels.add(x1)
                    break

    if log.level >= 3:
        log.write(str([(i,j,sym.tostring(x)) for ((i,j),l) in a.espans.iteritems() for x in l ]))
        log.write("\n---\n")
Esempio n. 14
0
                _item_to_text(node.ants[sym.getindex(child) - 1],
                              result,
                              memo,
                              mode=mode,
                              weights=weights)
            else:
                result.append(' ')
                result.append(quoteattr(sym.tostring(child)))
        result.append(')')


class TreeFormatException(Exception):
    """Raised when a forest/tree text cannot be parsed."""
    pass


# Placeholder label ("X") and span indices used for parsed forest nodes.
dummylabel = sym.fromtag("X")
dummyi = dummyj = None

# Lexer patterns: inter-token whitespace, "#id(LABEL" node openers,
# "#id" node back-references, and "ruleid<dcost>"-style labels.
whitespace = re.compile(r"\s+")
openbracket = re.compile(r"""(?:#(\d+))?\((\S+)""")
noderefre = re.compile(r"#([^)\s]+)")
labelre = re.compile(r"^(-?\d*)(?:<(\S+)>)?$")


def forest_lexer(s):
    si = 0
    while si < len(s):
        m = whitespace.match(s, si)
        if m:
            si = m.end()
            continue
Esempio n. 15
0
def forest_from_text_helper(tokiter, memo, want_item=False, delete_words=None):
    """Currently this assumes that the only frontier nodes in the tree are words.

    Consumes tokens from tokiter and yields forest nodes:

    - "or" tokens become Item nodes whose deductions are parsed recursively;
    - "nonterm" tokens become Deduction nodes (wrapped in an Item when
      want_item is true, since parents expect OR nodes);
    - "term" tokens yield terminal symbols, unless listed in delete_words;
    - "ref" tokens yield the node memoized under that id;
    - "pop" ends the current level.

    memo maps node ids to parsed nodes so back-references can resolve.
    Raises TreeFormatException on truncated input or an unknown token.
    """
    # Avoid the shared-mutable-default pitfall; behavior is unchanged
    # because delete_words is only ever read, never mutated.
    if delete_words is None:
        delete_words = []
    while True:
        try:
            # next() instead of the Python-2-only tokiter.next().
            tok = next(tokiter)
            toktype = tok[0]
        except StopIteration:
            raise TreeFormatException("incomplete tree")

        if toktype == "or":
            _, nodeid = tok
            deds = list(
                forest_from_text_helper(tokiter,
                                        memo,
                                        delete_words=delete_words))
            node = Item(dummylabel, dummyi, dummyj, deds=deds)
            if nodeid:
                memo[nodeid] = node
            yield node

        elif toktype == "nonterm":
            _, nodeid, ruleid, dcoststr = tok
            if ruleid == "":
                ruleid = dummylabel
            else:
                ruleid = sym.fromtag(ruleid)
            # dcost holds comma-separated feature:value pairs.
            dcost = svector.Vector()
            if dcoststr:
                for fv in dcoststr.split(','):
                    f, v = fv.split(':', 1)
                    v = float(v)
                    dcost[f] = v

            # Item children become antecedents referenced through indexed
            # variables; terminal children go straight into the rhs.
            ants = []
            rhs = []
            vi = 1
            for child in forest_from_text_helper(tokiter,
                                                 memo,
                                                 want_item=True,
                                                 delete_words=delete_words):
                if isinstance(child, Item):
                    ants.append(child)
                    rhs.append(sym.setindex(dummylabel, vi))
                    vi += 1
                else:
                    rhs.append(child)
            r = rule.Rule(ruleid, rule.Phrase(rhs), rule.Phrase(rhs))

            node = Deduction(ants=ants, rule=r, dcost=dcost)
            if want_item:  # need to insert OR node
                node = Item(dummylabel, dummyi, dummyj, deds=[node])
            if nodeid:
                memo[nodeid] = node
            yield node

        elif toktype == 'term':
            terminal = tok[1]
            if terminal not in delete_words:
                yield sym.fromstring(terminal)

        elif toktype == 'ref':
            yield memo[tok[1]]

        elif toktype == 'pop':
            return

        else:
            raise TreeFormatException("unknown token %s" % (tok, ))
Esempio n. 16
0
                if etree is None:
                    sys.stderr.write("warning, line %d: null tree" % a.lineno)
                    a.espans = {}
                elif etree.length != len(a.ewords):
                    sys.stderr.write(
                        "warning, line %d: length mismatch between English words and trees (%d != %d)\n"
                        % (a.lineno, len(a.ewords), etree.length))
                    sys.stderr.write(
                        "  start of English sentence: %s\n" %
                        " ".join([sym.tostring(x) for x in a.ewords[:5]]))
                    a.espans = {}
                else:
                    remove_req(etree)
                    a.espans = etree.spans()
                    for (span, labels) in a.espans.iteritems():
                        a.espans[span] = [sym.fromtag(x) for x in labels]

        # done reading all input lines
        if opts.discard_long_sentences and len(a.fwords) > opts.maxabslen:
            continue

        realcount += 1
        if opts.parallel is not None:
            if realcount % opts.parallel[1] != opts.parallel[
                    0] % opts.parallel[1]:
                continue

        for feature in features:
            feature.process_alignment(a)

        phrases = extract_phrases(a, maxabslen)
Esempio n. 17
0
def forest_from_text_helper(tokiter, memo, want_item=False, delete_words=None):
    """Currently this assumes that the only frontier nodes in the tree are words.

    Consumes tokens from tokiter and yields forest nodes: "or" tokens
    become Item nodes, "nonterm" tokens become Deductions (wrapped in an
    Item when want_item is true), "term" tokens yield terminal symbols
    unless listed in delete_words, "ref" tokens yield the node memoized
    under that id, and "pop" ends the current level.  Deductions carry
    the integer xrs rule id on .ruleid (None for the dummy label).

    Raises TreeFormatException on truncated input or an unknown token.
    """
    # Avoid the shared-mutable-default pitfall (delete_words is read-only).
    if delete_words is None:
        delete_words = []
    while True:
        try:
            # next() instead of the Python-2-only tokiter.next().
            tok = next(tokiter)
            toktype = tok[0]
        except StopIteration:
            raise TreeFormatException("incomplete tree")

        if toktype == "or":
            _, nodeid = tok
            deds = list(forest_from_text_helper(tokiter, memo,
                                                delete_words=delete_words))
            node = Item(dummylabel, dummyi, dummyj, deds=deds)
            if nodeid:
                memo[nodeid] = node
                node.nodeid = nodeid
            yield node

        elif toktype == "nonterm":
            _, nodeid, ruleid, dcoststr = tok
            if ruleid == "":
                ruleid = dummylabel
                # bug fix: xrs_ruleid was unbound on this branch, so the
                # `node.ruleid = xrs_ruleid` assignment below raised
                # NameError for dummy-labeled nonterminals.
                xrs_ruleid = None
            else:
                # lhuang: N.B.: sym.fromtag would re-alloc it
                xrs_ruleid = int(ruleid)
                ruleid = sym.fromtag(ruleid)  #int(ruleid) #

            dcost = svector.Vector()
            if dcoststr:
                # lhuang: features are read from forest, not rules
                # so there is no "e^..." or "10^..."
                for fv in dcoststr.split(','):
                    f, v = fv.split(':', 1)
                    dcost[f] = float(v)

            # Item children become antecedents referenced through indexed
            # variables; terminal children go straight into the rhs.
            ants = []
            rhs = []
            vi = 1
            for child in forest_from_text_helper(tokiter, memo, want_item=True,
                                                 delete_words=delete_words):
                if isinstance(child, Item):
                    ants.append(child)
                    rhs.append(sym.setindex(dummylabel, vi))
                    vi += 1
                else:
                    rhs.append(child)
            r = rule.Rule(ruleid, rule.Phrase(rhs), rule.Phrase(rhs))

            node = Deduction(ants=ants, rule=r, dcost=dcost)
            node.ruleid = xrs_ruleid

            if want_item: # need to insert OR node
                node = Item(dummylabel, dummyi, dummyj, deds=[node])
            if nodeid:
                memo[nodeid] = node
            yield node

        elif toktype == 'term':
            terminal = tok[1]
            if terminal not in delete_words:
                yield sym.fromstring(terminal)

        elif toktype == 'ref':
            yield memo[tok[1]]

        elif toktype == 'pop':
            return

        else:
            raise TreeFormatException("unknown token %s" % (tok,))
Esempio n. 18
0
        a.espans = None
        if opts.trees:
            if ebfile is not None:
                etree = tree.str_to_tree(ebfile.readline())
                if etree is None:
                    sys.stderr.write("warning, line %d: null tree" % a.lineno)
                    a.espans = {}
                elif etree.length != len(a.ewords):
                    sys.stderr.write("warning, line %d: length mismatch between English words and trees (%d != %d)\n" % (a.lineno, len(a.ewords), etree.length))
                    sys.stderr.write("  start of English sentence: %s\n" % " ".join([sym.tostring(x) for x in a.ewords[:5]]))
                    a.espans = {}
                else:
                    remove_req(etree)
                    a.espans = etree.spans()
                    for (span, labels) in a.espans.iteritems():
                        a.espans[span] = [sym.fromtag(x) for x in labels]

        # done reading all input lines
        if opts.discard_long_sentences and len(a.fwords) > opts.maxabslen:
            continue

        realcount += 1
        if opts.parallel is not None:
            if realcount % opts.parallel[1] != opts.parallel[0] % opts.parallel[1]:
                continue

        for feature in features:
            feature.process_alignment(a)

        phrases = extract_phrases(a, maxabslen)
        if opts.loosen:
Esempio n. 19
0
                result.append(' var ')
                _item_to_text(node.ants[sym.getindex(child)-1], result, memo, mode=mode, weights=weights)
            else:
                # lhuang: english word
                result.append(' word ')
                w = quoteattr(sym.tostring(child))
                result.append(w)
                print w,
        result.append(')')

    print # end of a hyperedge

class TreeFormatException(Exception):
    """Raised when a forest/tree text cannot be parsed."""
    pass

# Placeholder label ("-") and span indices used for parsed forest nodes.
dummylabel = sym.fromtag("-")
dummyi = dummyj = None

# Lexer patterns: inter-token whitespace, "#id(LABEL" node openers,
# "#id" node back-references, and "ruleid<dcost>"-style labels.
whitespace = re.compile(r"\s+")
openbracket = re.compile(r"""(?:#(\d+))?\((\S+)""")
noderefre = re.compile(r"#([^)\s]+)")
labelre = re.compile(r"^(-?\d*)(?:<(\S+)>)?$")

def forest_lexer(s):
    si = 0
    while si < len(s):
        m = whitespace.match(s, si)
        if m:
            si = m.end()
            continue
Esempio n. 20
0
        for child in children:
            if isinstance(child, Item):
                result.append(' ')
                _item_to_text(child, result, memo, mode=mode, weights=weights)
            elif sym.isvar(child):
                result.append(' ')
                _item_to_text(node.ants[sym.getindex(child)-1], result, memo, mode=mode, weights=weights)
            else:
                result.append(' ')
                result.append(quoteattr(sym.tostring(child)))
        result.append(')')

class TreeFormatException(Exception):
    """Raised when a forest/tree text cannot be parsed."""
    pass

# Placeholder label ("X") and span indices used for parsed forest nodes.
dummylabel = sym.fromtag("X")
dummyi = dummyj = None

# Lexer patterns: inter-token whitespace, "#id(LABEL" node openers,
# "#id" node back-references, and "ruleid<dcost>"-style labels.
whitespace = re.compile(r"\s+")
openbracket = re.compile(r"""(?:#(\d+))?\((\S+)""")
noderefre = re.compile(r"#([^)\s]+)")
labelre = re.compile(r"^(-?\d*)(?:<(\S+)>)?$")

def forest_lexer(s):
    si = 0
    while si < len(s):
        m = whitespace.match(s, si)
        if m:
            si = m.end()
            continue