Example #1
0
def compare_trees(fG, fH):
    """Build a structural diff report for paired gold/hypothesis tree lines.

    ``fG`` (gold) and ``fH`` (hypothesis) are walked in lockstep, stopping at
    the shorter of the two. Every pair of lines is parsed with
    ``tree.Tree.read`` and contributes two entries to the returned list: a
    ``'TREE <n>:\\n'`` header (numbered from 1) followed by the annotated
    comparison string terminated by a newline.
    """

    def compare(tG, tH):
        # Two nodes agree when label, number of words and number of children
        # match and, for nodes with two or more children, the first children
        # span the same number of words.
        agree = (tG.c == tH.c
                 and len(tG.words()) == len(tH.words())
                 and len(tG.ch) == len(tH.ch)
                 and (len(tH.ch) < 2
                      or len(tG.ch[0].words()) == len(tH.ch[0].words())))
        if not agree:
            # Dump both subtrees verbatim between the marker strings.
            return (' <***GOLD***> ' + str(tG) + ' <**HYPOTH**> ' + str(tH)
                    + ' <**********>')

        has_children = len(tG.ch) > 0
        pieces = [(' (' if has_children else ' ') + tG.c]
        # Child counts are equal on this path, so pairing them up is exact.
        for childG, childH in zip(tG.ch, tH.ch):
            pieces.append(compare(childG, childH))
        if has_children:
            pieces.append(')')
        return ''.join(pieces)

    report = []
    for number, (lineH, lineG) in enumerate(zip(fH, fG), 1):
        tH = tree.Tree()
        tH.read(lineH)

        tG = tree.Tree()
        tG.read(lineG)

        report.append('TREE ' + str(number) + ':\n')
        report.append(compare(tG, tH) + '\n')

    return report
Example #2
0
 def wrap_terms(t):
     """Give terminals below ``t`` a unary parent labelled from ``pos``.

     For a node with several children, each child for which ``term`` is true
     is replaced by a new one-child tree whose label is popped from the front
     of ``pos``; other children are processed recursively. A node with a
     single childless child is relabelled with the next ``pos`` entry
     instead. ``term``, ``pos`` and ``tree`` come from the surrounding scope.
     Returns ``t`` itself (modified in place).
     """
     arity = len(t.ch)
     if arity > 1:
         for idx, child in enumerate(t.ch):
             if term(child):
                 t.ch[idx] = tree.Tree(pos.pop(0), [tree.Tree(child.c, [])])
             else:
                 wrap_terms(child)
     elif arity == 1 and len(t.ch[0].ch) == 0:
         t.c = pos.pop(0)
     return t
Example #3
0
def trees2deps(trees_buffer, model_buffer, debug=False):
    """Convert bracketed trees into dependency lines using a head model.

    Args:
        trees_buffer: Iterable of tree lines; each is parsed with
            ``tree.Tree.read``.
        model_buffer: Iterable of model lines fed one by one to
            ``pcfg_model.CondModel('R').read``.
        debug: When true, node labels are annotated in place with head
            information and the annotated tree is emitted before its
            dependencies.

    Returns:
        A list of strings. For every tree there is one
        ``'X(<headword>-<headindex>, <word>-<index>)\\n'`` entry per
        preterminal, followed by a ``'\\n'`` separator entry.
    """
    out = []

    # Per-tree bookkeeping; both dicts are rebound for every input line below
    # and are shared with get_deps through the closure.
    heads = {}
    deps = {}

    def preterms(t):
        # Collect the preterminal nodes under t in child order.
        if preterm(t):
            return [t]
        x = []
        for ch in t.ch:
            x += preterms(ch)
        return x

    def preterm(t):
        # A preterminal has exactly one child, and that child has no children.
        return len(t.ch) == 1 and t.ch[0].ch == []

    def get_deps(t, ix, words):
        # Records, for every preterminal under t, the index of its governor in
        # deps, and returns the 1-based position (within words) of the
        # preterminal that heads t. ix is the governor index for t's head.
        if preterm(t):
            deps[t] = ix
            # NOTE(review): index() relies on Tree equality and the dict key on
            # Tree hashing; if Tree compares structurally, repeated identical
            # preterminals would resolve to the first occurrence -- confirm.
            return (words.index(t) + 1)
        # The head child is the one scoring highest under the head model for
        # this parent label.
        heads[t] = max(t.ch, key=lambda x: head_model[t.c][x])
        if debug:
            heads[t].c = 'HEAD:' + heads[t].c + '->' + str(ix)
        children = t.ch[:]
        head = children.pop(children.index(heads[t]))
        # Resolve the head first so its word index is known before the
        # remaining children are attached to it.
        headix = get_deps(head, ix, words)
        for ch in children:
            get_deps(ch, headix, words)
            if debug:
                ch.c += '->' + str(headix)
        return (headix)

    head_model = pcfg_model.CondModel('R')
    for line in model_buffer:
        head_model.read(line)

    # A single Tree object is reused for every input line.
    t = tree.Tree()

    for line in trees_buffer:
        heads = {}
        deps = {}
        t.read(line)
        preterminals = preterms(t)
        # Index 0 is the governor of the sentence head.
        get_deps(t, 0, preterminals)
        # Placeholder at position 0 so that governor index 0 prints as ROOT
        # and the real preterminals become 1-based.
        preterminals.insert(0, tree.Tree('X', [tree.Tree('ROOT', [])]))
        if debug:
            out.append(str(t) + '\n')
        for i in range(1, len(preterminals)):
            out.append('X(' + preterminals[deps[preterminals[i]]].ch[0].c +
                       '-' + str(deps[preterminals[i]]) + ', ' +
                       str(preterminals[i].ch[0].c) + '-' + str(i) + ')\n')
        out.append('\n')

    return out
Example #4
0
        def out(inputs):
            """Return, for each tree line, its words joined by single spaces.

            Lines that are blank after stripping, or whose first character
            after stripping is ``%``, are skipped. Every result string ends
            with a newline. The words are whatever ``Tree.words()`` reports
            after ``Tree.read`` has parsed the stripped line.
            """
            parser = tree.Tree()
            sentences = []
            for raw in inputs:
                text = raw.strip()
                if text == '' or text[0] == '%':
                    continue
                parser.read(text)
                sentences.append(' '.join(parser.words()) + '\n')

            return sentences
Example #5
0
        def out(inputs):
            """Re-serialise each tree line after calling ``Tree.upper`` on it.

            Lines that are blank after stripping, or whose first character
            after stripping is ``%``, are skipped. Every result is ``str`` of
            the processed tree followed by a newline.
            """
            parser = tree.Tree()
            rendered = []
            for raw in inputs:
                text = raw.strip()
                if text == '' or text[0] == '%':
                    continue
                parser.read(text)
                parser.upper()
                rendered.append(str(parser) + '\n')

            return rendered
Example #6
0
        def out(inputs, n):
            """Keep the tree lines whose parsed tree has at most ``n`` words.

            Lines that are blank after stripping, or whose first character
            after stripping is ``%``, are skipped. Surviving lines are
            returned in their stripped form with a single newline appended
            (the tree is only used for counting, not re-serialised).
            """
            probe = tree.Tree()
            kept = []
            for raw in inputs:
                text = raw.strip()
                if text == '' or text[0] == '%':
                    continue
                probe.read(text)
                if len(probe.words()) <= n:
                    kept.append(text + '\n')

            return kept
Example #7
0
def plug_leaves(trees, words):
    """Pair tree lines with word lines and serialise the plugged trees.

    ``trees`` and ``words`` are consumed in lockstep (stopping at the shorter
    one). Each tree line is parsed, handed to ``plug_words`` together with
    the whitespace-split tokens of the matching word line, and then rendered
    as a string followed by a newline.
    """
    # One Tree instance is reused; read() is called afresh for every line.
    current = tree.Tree()
    rendered = []

    for tree_line, word_line in zip(trees, words):
        current.read(tree_line)
        tokens = word_line.split()
        plug_words(current, tokens)
        rendered.append('%s\n' % current)

    return rendered
Example #8
0
        def out(inputs):
            """Re-serialise each tree line after pruning with a ``$`` predicate.

            Lines that are blank after stripping, or whose first character
            after stripping is ``%``, are skipped. For the rest the tree is
            parsed, ``Tree.prune`` is called with a predicate that is true for
            nodes labelled ``'$'``, and ``str`` of the tree plus a newline is
            collected.
            """
            def is_curr(node):
                return node.c == '$'

            parser = tree.Tree()
            rendered = []
            for raw in inputs:
                text = raw.strip()
                if text == '' or text[0] == '%':
                    continue
                parser.read(text)
                parser.prune(is_curr)
                rendered.append(str(parser) + '\n')

            return rendered
Example #9
0
    def compare(tG, tH):
        """Render gold tree ``tG`` against hypothesis ``tH`` as one string.

        Nodes agree when label, word count and child count match and, for
        nodes with two or more children, the first children span equally
        many words. Agreeing nodes are printed in bracketed form and their
        children compared pairwise; a disagreeing pair is dumped verbatim
        between ``<***GOLD***>`` / ``<**HYPOTH**>`` / ``<**********>``
        markers.
        """
        agree = (tG.c == tH.c
                 and len(tG.words()) == len(tH.words())
                 and len(tG.ch) == len(tH.ch)
                 and (len(tH.ch) < 2
                      or len(tG.ch[0].words()) == len(tH.ch[0].words())))
        if not agree:
            return (' <***GOLD***> ' + str(tG) + ' <**HYPOTH**> ' + str(tH)
                    + ' <**********>')

        has_children = len(tG.ch) > 0
        pieces = [(' (' if has_children else ' ') + tG.c]
        # Child counts are equal on this path, so pairing them up is exact.
        for childG, childH in zip(tG.ch, tH.ch):
            pieces.append(compare(childG, childH))
        if has_children:
            pieces.append(')')

        return ''.join(pieces)
Example #10
0
        def out(inputs):
            """Re-serialise each tree line with parentheses labels escaped.

            Lines that are blank after stripping, or whose first character
            after stripping is ``%``, are skipped. For the rest the tree is
            parsed and ``Tree.mapLabels`` is called with a function that turns
            ``'('`` into ``'-LRB-'`` and ``')'`` into ``'-RRB-'`` and leaves
            every other value unchanged; ``str`` of the tree plus a newline
            is collected.
            """
            mapper = {'(': '-LRB-', ')': '-RRB-'}

            def labelmap(label):
                if label in mapper:
                    return mapper[label]
                return label

            parser = tree.Tree()
            rendered = []
            for raw in inputs:
                text = raw.strip()
                if text == '' or text[0] == '%':
                    continue
                parser.read(text)
                parser.mapLabels(labelmap)
                rendered.append(str(parser) + '\n')

            return rendered
Example #11
0
        def out(inputs):
            """Turn each sentence line into a right-branching tree string.

            Every whitespace-separated word becomes ``(1 word) ``; working
            from the last word backwards, each earlier word is combined with
            the structure built so far under a new ``(1 ... ) `` node. The
            bracketing is parsed with ``Tree.read`` and ``str`` of the tree
            plus a newline is collected. Lines that are blank after stripping
            are skipped (``%`` lines are NOT treated specially here).
            """
            parser = tree.Tree()
            left = '(1 '
            right = ') '
            rendered = []
            for raw in inputs:
                sentence = raw.strip()
                if sentence == '':
                    continue
                bracketed = ''
                for word in reversed(sentence.split()):
                    leaf = left + word + right
                    # The last word stands alone; earlier words wrap the rest.
                    bracketed = (left + leaf + bracketed + right
                                 if bracketed else leaf)
                parser.read(bracketed)
                rendered.append(str(parser) + '\n')

            return rendered
Example #12
0
              + str(getPOS(T)) + ' ' + str(depdirSyn) + ' ' + str(depdirSem) + ' ' + str(depdirSynM) + ' ' + str(depdirSemM) + ' ' \
              + str(int(isPhrasePunc(T.ch[0]))))
    else:
        if len(T.ch[0].c) > 1 and T.ch[0].c.endswith('lC') and not '-c' in T.c:
            coords.append(T)
            ends.append(last(T))
        for t in T.ch:
            printToks(t)

# Emit the column header once. The trailing columns (pos, depdirSyn,
# depdirSem, depdirSynM, depdirSemM, punc) match the values written at the
# end of each printToks row; the earlier columns are produced by the part of
# printToks that is not shown here.
print('word dltdc dltdcv ' \
    + 'dlt dltc dltcv dltv ' \
    + 'dltm dltcm dltcvm dltvm ' \
    + 'pos depdirSyn depdirSem depdirSynM depdirSemM' + ' ' \
    + 'punc')

# Process one tree per stdin line, skipping blank lines and lines whose first
# non-blank character is '%'.
for line in sys.stdin:
    if (line.strip() != '') and (line.strip()[0] != '%'):
        # Reset the module-level per-sentence state before walking the tree.
        # printToks appends to coords and ends; the remaining names are
        # presumably read/updated by printToks as well -- confirm there.
        terms = []
        DLTcosts = []
        DLTcostsV = []
        coords = []
        ends = []
        complete = []
        cCosts = []
        cvCosts = []
        post = [0, 0, 0, 0, 0, 0, 0, 0]
        postpost = [0, 0, 0, 0, 0, 0, 0, 0]
        T = tree.Tree()
        T.read(line)
        printToks(T)
Example #13
0
def deps2trees(buffer, format='stanford', debug=False):
    """Convert dependency-annotated sentences into bracketed trees.

    A sentence is a run of token lines terminated by a blank line (or by the
    end of the input). Every token becomes a preterminal; each preterminal is
    attached as a sibling inside the tree of its head, and the token whose
    head index is 0 becomes the root of the sentence tree.

    Args:
        buffer: Iterable of input lines (open file, list, generator, ...).
        format: Input format, case-insensitive: ``'stanford'``
            (``rel(head-i, word-j)`` lines), ``'conll'`` (head in the 8th
            column) or ``'conll-x'`` (head in the 7th column). For the two
            CoNLL formats the 4th column supplies the preterminal label;
            Stanford tokens are labelled ``'X'``.
        debug: When true, the string form of every parsed token dict is
            appended to the output as well.

    Returns:
        List of strings: one ``'<tree>\\n'`` entry per sentence, preceded by
        that sentence's debug entries if requested. Empty input yields ``[]``.

    Raises:
        ValueError: If a token line is read and ``format`` is not supported.
        AttributeError: If a ``'stanford'`` line does not match the expected
            dependency pattern.
    """
    out = []
    # Raw string: '\(' is a regex escape, not a Python string escape.
    stan_dep = re.compile(r' *[^ ]*\([^ ]+-([0-9]+) *, *([^ ]+)-([0-9]+)\)')
    fmt = format.lower()
    # Accept any iterable; iter() returns an iterator unchanged.
    lines = iter(buffer)

    # Reports whether a tree is terminal
    def term(t):
        return t.ch == []

    # Ensures that each terminal in t has a unary pre-terminal parent,
    # consuming labels from the front of the current sentence's pos list.
    def wrap_terms(t):
        if len(t.ch) > 1:
            for i, child in enumerate(t.ch):
                if term(child):
                    t.ch[i] = tree.Tree(pos.pop(0), [tree.Tree(child.c, [])])
                else:
                    wrap_terms(child)
        elif len(t.ch) == 1 and len(t.ch[0].ch) == 0:
            t.c = pos.pop(0)
        return t

    # next(..., None) turns exhaustion into a falsy sentinel, so neither an
    # empty input nor a missing final blank line raises StopIteration.
    line = next(lines, None)
    while line:
        # Dependency tokens and preterminal labels of the current sentence
        deps = []
        pos = []

        # Each token is on its own line, and sentences are separated by blank
        # lines. Read until the end of the sentence (or of the input).
        while line and not line.strip() == '':
            # Each token must have 'word', 'dep', and 'ix' fields.
            if fmt == 'conll':
                fields = line.split()
                tok = {'word': fields[1], 'dep': int(fields[7]), 'ix': int(fields[0])}
                pos.append(str(fields[3]))
            elif fmt == 'conll-x':
                fields = line.split()
                tok = {'word': fields[1], 'dep': int(fields[6]), 'ix': int(fields[0])}
                pos.append(str(fields[3]))
            elif fmt == 'stanford':
                match = stan_dep.match(line)
                word = match.group(2)
                dep = match.group(1)
                ix = match.group(3)
                pos.append('X')
                tok = {'word': word, 'dep': int(dep), 'ix': int(ix)}
            else:
                raise ValueError('Unsupported format %s' % format)
            deps.append(tok)
            if debug:
                out.append('%s\n' % tok)
            line = next(lines, None)

        # Dictionary of trees indexed by head sentence position
        trees = {0: tree.Tree()}

        # Add a preterminal to trees for each token in the sentence
        for tok in deps:
            trees[tok['ix']] = tree.Tree('X', [tree.Tree(tok['word'], [])])

        # Combine trees based on their dependencies (deps to 0 are the main head)
        for tok in deps:
            # Dep to 0, this is the main head
            if tok['dep'] == 0:
                trees[0] = trees[tok['ix']]
            # Dep to following head, insert tree as preceding sibling of head
            elif tok['ix'] < tok['dep']:
                trees[tok['dep']].ch.insert(-1, trees[tok['ix']])
            # Dep to preceding head, insert tree as following sibling of head
            else:
                trees[tok['dep']].ch.append(trees[tok['ix']])

        # Make sure all terminals have unary pre-terminal parents
        trees[0] = wrap_terms(trees[0])

        # Emit the main tree
        out.append('%s\n' % trees[0])

        # Start reading the next sentence (None once the input is exhausted)
        line = next(lines, None)

    return out