Beispiel #1
0
def fall_back_left_branching_token(clean_tokens):
    """
    Build a chain-shaped fallback tree over already constructed tokens.

    Each token receives the edge label '_' (the token objects are modified
    in place) and is added as an ordered node whose id is its position in
    ``clean_tokens``. The first token becomes the root; every later token is
    attached as a child of the token directly before it.

    :param clean_tokens: tokens of the sentence, in sentence order
    :return: the chain-shaped tree (empty if ``clean_tokens`` is empty)
    :rtype: HybridTree
    """
    chain = HybridTree()
    predecessor = None
    for position, tok in enumerate(clean_tokens):
        tok.set_edge_label('_')
        chain.add_node(position, tok, True)
        if predecessor is None:
            # only the very first token has no predecessor
            chain.add_to_root(position)
        else:
            chain.add_child(predecessor, position)
        predecessor = position
    return chain
Beispiel #2
0
def fall_back_left_branching(forms, poss):
    """
    Build a chain-shaped fallback tree from word forms and POS tags.

    For every (form, POS) pair a token is created with
    ``construct_conll_token`` and labelled with the edge label '_'. The
    token is added as an ordered node whose id is its position. Position 0
    is the root; every other position is a child of the position before it.
    Pairs are formed with ``zip``, so surplus entries of the longer list
    are ignored.

    :param forms: word forms of the sentence
    :param poss: POS tags of the sentence
    :return: the chain-shaped tree
    :rtype: HybridTree
    """
    chain = HybridTree()
    for position, (word, tag) in enumerate(zip(forms, poss)):
        tok = construct_conll_token(word, tag)
        tok.set_edge_label('_')
        chain.add_node(position, tok, True)
        if position:
            chain.add_child(position - 1, position)
        else:
            chain.add_to_root(position)
    return chain
Beispiel #3
0
def hybrid_tree_1():
    """
    Build the four-word example dependency tree "Piet Marie helpen lezen".

    'helpen' (node 'v') is the root with the dependents 'lezen' ('v2') and
    'Piet' ('v1'); 'Marie' ('v21') depends on 'lezen'. The tree is
    reordered before it is returned.

    :rtype: HybridTree
    """
    # (node id, form, POS tag, dependency relation), in sentence order
    words = [
        ('v1', 'Piet', 'NP', 'SBJ'),
        ('v21', 'Marie', 'N', 'OBJ'),
        ('v', 'helpen', 'V', 'ROOT'),
        ('v2', 'lezen', 'V', 'VBI'),
    ]
    # (head, dependent); deliberately not in sentence order, reorder() fixes it
    edges = [('v', 'v2'), ('v', 'v1'), ('v2', 'v21')]

    result = HybridTree()
    for node_id, form, tag, relation in words:
        token = CoNLLToken(form, '_', tag, tag, '_', relation)
        result.add_node(node_id, token, True)
    for head, dependent in edges:
        result.add_child(head, dependent)
    result.add_to_root('v')
    result.reorder()
    return result
def multi_const_tree():
    """
    Build an example constituent tree named "multi" that has two roots.

    The terminals A, B, C, D (ids '1.1', '2.1', '1.2', '2.2') are added in
    that order. The category nodes E ('1') and F ('2') are both roots, added
    as '2' first and then '1', and node 'k' dominates 'k.1' and 'k.2'.

    :rtype: HybridTree
    """
    # (node id, form, POS tag), in sentence order
    terminals = [
        ('1.1', 'A', 'pA'),
        ('2.1', 'B', 'pB'),
        ('1.2', 'C', 'pC'),
        ('2.2', 'D', 'pD'),
    ]
    # (node id, category label)
    categories = [('1', 'E'), ('2', 'F')]

    result = HybridTree("multi")
    for node_id, form, tag in terminals:
        result.add_node(node_id, ConstituentTerminal(form, tag), True, True)
    for node_id, label in categories:
        result.add_node(node_id, ConstituentCategory(label), False, True)
    for top in ('2', '1'):
        result.add_to_root(top)
        for suffix in ('1', '2'):
            result.add_child(top, '{}.{}'.format(top, suffix))
    return result
def multi_dep_tree():
    """
    Build an example dependency tree named 'multi' that has two roots.

    The words A..E are added in sentence order under the ids '1', '211',
    '11', '2', '21'. Word X carries the POS tag 'pX' and the dependency
    relation 'dX'. Roots are '2' and '1' (added in that order); '21' and
    '211' depend on '2', '11' depends on '1'. The tree is reordered before
    it is returned.

    :rtype: HybridTree
    """
    # (node id, word), in sentence order
    words = [('1', 'A'), ('211', 'B'), ('11', 'C'), ('2', 'D'), ('21', 'E')]
    # (head, dependent)
    edges = [('2', '21'), ('2', '211'), ('1', '11')]

    result = HybridTree('multi')
    for node_id, word in words:
        tag = 'p' + word
        token = CoNLLToken(word, '_', tag, tag, '_', 'd' + word)
        result.add_node(node_id, token, True)
    result.add_to_root('2')
    result.add_to_root('1')
    for head, dependent in edges:
        result.add_child(head, dependent)
    result.reorder()
    return result
Beispiel #6
0
def disconnect_punctuation(trees):
    """
    Lazily disconnect punctuation from each hybrid tree of a corpus.

    For every input tree a new tree with the same sentence label is built.
    Punctuation tokens (as decided by ``is_punctuation`` on the token form)
    are kept as nodes but added with the fourth ``add_node`` argument set to
    False and without any edge. A non-punctuation token is attached to its
    closest ancestor that is either not punctuation or a root of the input
    tree; if that ancestor is a punctuation root, the token becomes a root
    of the new tree instead.

    Trees that end up consisting only of punctuation are dropped silently.
    Trees failing the sanity checks are printed and abort the generator.

    :param trees: corpus of hybrid trees
    :type trees: __generator[HybridTree]
    :return: corpus of hybrid trees
    :rtype: __generator[HybridTree]
    :raise Exception: if a resulting tree fails the sanity checks
    """
    for source in trees:
        stripped = HybridTree(source.sent_label())

        # keep every non-punctuation root as a root
        for top in source.root:
            if is_punctuation(source.node_token(top).form()):
                continue
            stripped.add_to_root(top)

        for node in source.full_yield():
            tok = source.node_token(node)
            if is_punctuation(tok.form()):
                stripped.add_node(node, tok, True, False)
                continue

            # climb past punctuation ancestors, but never past a root
            ancestor = source.parent(node)
            while (ancestor and ancestor not in source.root
                   and is_punctuation(source.node_token(ancestor).form())):
                ancestor = source.parent(ancestor)

            if ancestor and is_punctuation(
                    source.node_token(ancestor).form()):
                # only reachable for a punctuation root: promote the node
                stripped.add_to_root(node)
            else:
                # NOTE(review): for a node without parent `ancestor` is falsy
                # here and is still passed to add_child -- confirm that
                # HybridTree tolerates this.
                stripped.add_child(ancestor, node)
            stripped.add_node(node, tok, True, True)

        if not stripped:
            continue

        # basic sanity checks
        only_punctuation = (
            not stripped.root
            and len(stripped.id_yield()) == 0
            and len(stripped.nodes()) == len(stripped.full_yield()))
        if only_punctuation:
            # Tree consists only of punctuation
            continue

        malformed = (
            not stripped.root
            or stripped.n_nodes() != len(stripped.id_yield())
            or len(stripped.nodes()) != len(stripped.full_yield()))
        if malformed:
            print(source)

            print(stripped)
            print(stripped.sent_label())
            print("Root:", stripped.root)
            print("Nodes: ", stripped.n_nodes())
            print("Id_yield:", len(stripped.id_yield()), stripped.id_yield())
            print("Nodes: ", len(stripped.nodes()))
            print("full yield: ", len(stripped.full_yield()))
            raise Exception()

        yield stripped
Beispiel #7
0
    def test_recursive_partitioning_transformation(self):
        # A star-shaped tree: 'a' is the root, 'b', 'c' and 'd' are its
        # children. Checks the token yield and the recursive partitioning,
        # then prints the result of the 'fanout-1' partitioning.
        labels = ['a', 'b', 'c', 'd']
        star = HybridTree("mytree")
        for label in labels:
            token = CoNLLToken(label, '_', '_', '_', '_', '_')
            star.add_node(label, token, True, True)
            if label == 'a':
                continue
            star.add_child('a', label)
        star.add_to_root('a')

        print(star)
        forms = [token.form() for token in star.token_yield()]
        self.assertEqual(forms, labels)

        expected = ({0, 1, 2, 3},
                    [({0}, []), ({1}, []), ({2}, []), ({3}, [])])
        self.assertEqual(star.recursive_partitioning(), expected)
        print(star.recursive_partitioning())

        # exactly one partitioning function is expected for this name
        partitionings = the_recursive_partitioning_factory().get_partitioning(
            'fanout-1')
        (fanout_1,) = partitionings

        print(fanout_1(star))
Beispiel #8
0
def derivation_to_hybrid_tree(der,
                              poss,
                              ordered_labels,
                              construct_token,
                              disconnected=None):
    """
    Turn a derivation tree into a hybrid tree.

    ``poss`` and ``ordered_labels`` are assumed to have equal length.
    Sentence position i becomes the node "d<i>" if it is listed in
    ``disconnected``. Otherwise it becomes "c<j>", where j counts the
    connected positions starting at 1. Every derivation node becomes an
    inner node labelled with the left-hand-side nonterminal of its rule. It
    dominates its derivation children and the "c<position>" nodes of its
    terminal positions.

    :param der: the derivation
    :type der: LCFRSDerivation
    :param poss: list of POS-tags
    :type poss: list[str]
    :param ordered_labels: list of words
    :type ordered_labels: list[str]
    :param construct_token: callable invoked as ``construct_token(label,
        pos, is_terminal)`` to build the token of a node
    :param disconnected: list of positions in ordered_labels that are disconnected
    :type disconnected: list[object]
    :rtype: HybridTree
    """
    skipped = disconnected if disconnected else []

    result = HybridTree()
    connected_seen = 0
    for index, label in enumerate(ordered_labels):
        leaf = construct_token(label, poss[index], True)
        if index in skipped:
            result.add_node("d" + str(index), leaf, True, False)
            continue
        # connected leaves are numbered consecutively, starting at 1
        connected_seen += 1
        result.add_node("c" + str(connected_seen), leaf, True, True)

    for node in der.ids():
        nonterminal = der.getRule(node).lhs().nont()
        result.add_node(node, construct_token(nonterminal, '_', False))
        for sub_node in der.child_ids(node):
            result.add_child(node, sub_node)
        for leaf_number in der.terminal_positions(node):
            result.add_child(node, "c" + str(leaf_number))

    result.add_to_root(der.root_id())
    result.reorder()
    return result
def query_result_tree(connection, exp, tree_id):
    """
    Fetch the tree that an experiment produced for a sentence.

    If ``result_trees`` holds a row for (``exp``, ``tree_id``) whose status
    is "parse" or "fallback", the stored dependency tree is rebuilt from
    ``result_tree_nodes`` and returned with that status. In every other
    case a left-branching tree without labels is built from ``tree_nodes``.
    The status is then "simple_fallback" if no row was found, or otherwise
    the status stored in the row.

    :param connection: database connection; must offer ``cursor()`` and
        accept '?' placeholders (presumably sqlite3 -- TODO confirm)
    :param exp: experiment id
    :param tree_id: id of the gold tree / sentence
    :rtype: str, HybridTree
    :return: pair of status and tree
    :raise AssertionError: if more than one result row exists or the
        constructed tree has no root
    """
    cursor = connection.cursor()
    result_tree_ids = cursor.execute(
        '''SELECT rt_id, status FROM result_trees WHERE exp_id = ? AND t_id = ?''',
        (exp, tree_id)).fetchall()

    # parse:
    if result_tree_ids:
        assert (len(result_tree_ids) == 1)
        result_tree_id, status = result_tree_ids[0]
        if status in ["parse", "fallback"]:
            name = cursor.execute('''SELECT name FROM trees WHERE t_id = ?''',
                                  (tree_id, )).fetchall()[0][0]
            tree_nodes = cursor.execute((
                ' SELECT tree_nodes.sent_position, label, pos, result_tree_nodes.head, result_tree_nodes.deprel FROM result_tree_nodes\n'
                '                JOIN result_trees\n'
                '                  ON result_tree_nodes.rt_id = result_trees.rt_id\n'
                '                JOIN tree_nodes\n'
                '                  ON result_trees.t_id = tree_nodes.t_id\n'
                '                  AND result_tree_nodes.sent_position = tree_nodes.sent_position\n'
                '                WHERE result_tree_nodes.rt_id = ?'),
                                        (result_tree_id, ))
            tree = HybridTree(name)
            for i, label, pos, head, deprel in tree_nodes:
                if deprel is None:
                    deprel = 'UNKNOWN'
                token = CoNLLToken(label, '_', pos, pos, '_', deprel)
                tree.add_node(str(i), token, True, True)
                # head 0 marks a root
                if head == 0:
                    tree.add_to_root(str(i))
                else:
                    tree.add_child(str(head), str(i))
            # The previous check `tree.root is not []` compared identity with
            # a fresh list and could never fail; test for emptiness instead.
            assert tree.root, "result tree has no root"
            return status, tree
    # legacy: no entry found
    else:
        status = "simple_fallback"

    # Create a left branching tree without labels as default strategy
    tree_nodes = cursor.execute(
        ''' SELECT tree_nodes.sent_position, label, pos FROM tree_nodes
        WHERE tree_nodes.t_id = ?''', (tree_id, )).fetchall()

    # The strategy maps a sentence position to the position of its parent.
    # Only left_branch is active; right_branch is kept as the alternative.
    left_branch = lambda x: x - 1
    right_branch = lambda x: x + 1
    strategy = left_branch

    length = len(tree_nodes)
    tree = HybridTree()
    for i, label, pos in tree_nodes:
        token = CoNLLToken(label, '_', pos, pos, '_', '_')
        tree.add_node(str(i), token, True, True)
        parent = strategy(i)
        # a parent position just outside 1..length means "no parent"
        if (parent == 0
                and strategy == left_branch) or (parent == length + 1
                                                 and strategy == right_branch):
            tree.add_to_root(str(i))
        else:
            tree.add_child(str(parent), str(i))
    # see above: a real emptiness check instead of `is not []`
    assert tree.root, "fallback tree has no root"
    return status, tree
def parse_conll_corpus(path, ignore_punctuation, limit=sys.maxsize, start=0):
    """
    Lazily parse a dependency corpus (in CoNLL format) and generate hybrid trees.

    Trees are named 'tree<k>', where k counts the sentences read so far
    (starting at 1). Lines starting with '#' are skipped, and so are lines
    whose id matches ``MULTI_TOKEN``.

    :param path: path to corpus
    :type: str
    :param ignore_punctuation: currently only controls whether a tree that
        fails the sanity checks is skipped silently (True) or raises (False);
        the code removing punctuation from the tree is commented out
    :type ignore_punctuation: bool
    :param limit: stop reading once this many trees have been counted; trees
        skipped because of ``start`` or failed sanity checks count as well
    :type: int
    :param start: only trees whose running number is greater than ``start``
        are generated
    :type start: int
    :return: a series of hybrid trees read from file
    :rtype: __generator[HybridTree]
    :raise Exception: unexpected input in corpus file, or a tree failing the
        sanity checks while ``ignore_punctuation`` is False
    """

    # print path
    with open(path) as file_content:
        tree_count = 0

        while tree_count < limit:
            tree = None

            # fetch the next non-comment line; end of file ends the generator
            try:
                line = next(file_content)
                while line.startswith('#'):
                    line = next(file_content)
            except StopIteration:
                break

            # consume consecutive token lines, i.e. one sentence
            match = CONLL_LINE.match(line)
            while match:
                # a token with id '1' starts a new tree
                if match.group(1) == '1':
                    tree_count += 1
                    tree = HybridTree('tree' + str(tree_count))

                node_id = match.group(1)
                form = match.group(2)
                lemma = match.group(3)
                cpos = match.group(4)
                pos = match.group(5)
                feats = match.group(6)
                parent = match.group(7)
                deprel = match.group(8)

                # We ignore information about multi-word tokens as present in the UD version of Prague Dep. TB
                if MULTI_TOKEN.search(node_id):
                  pass
                else:
                    # If punctuation is to be ignored, we
                    # remove it from the hybrid tree
                    # Punctuation according to definition
                    # cf. http://ilk.uvt.nl/conll/software.html#eval

                    # if not ignore_punctuation or form.translate(no_translation, string.punctuation):
                    tree.add_node(node_id, CoNLLToken(form, lemma, cpos, pos, feats, deprel), True, True)
                    if parent != '0':
                        tree.add_child(parent, node_id)
                    # else:
                    #    tree.add_node(node_id, CoNLLToken(form, lemma, pos, fine_grained_pos, feats, deprel), True, False)

                    # TODO: If punctuation is ignored and the root is punctuation,
                    # TODO: it is added to the tree anyhow.
                    # parent '0' marks a root
                    if parent == '0':
                        tree.add_to_root(node_id)

                # advance to the next non-comment line; at end of file an
                # empty line is simulated so that the sentence is closed
                try:
                    line = next(file_content)
                    while line.startswith('#'):
                        line = next(file_content)
                    # NOTE(review): `search` here but `match` for the first
                    # line of a sentence -- confirm this is intended.
                    match = CONLL_LINE.search(line)
                except StopIteration:
                    line = ''
                    match = None

            # Assume empty line, otherwise raise exception
            match = EMPTY_LINE.match(line)
            if not match:
                raise Exception("Unexpected input in CoNLL corpus file.")

            # tree is None if no token with id '1' was read (e.g. blank line)
            if tree:
                # basic sanity checks
                if not tree.root:
                    # FIXME: ignoring punctuation may lead to malformed trees
                    print("non-rooted")
                    if ignore_punctuation:
                        continue
                    raise Exception
                    # elif root > 1:
                    # FIXME: turkish corpus contains trees with more than one root
                    # FIXME: currently, they are ignored
                    # continue
                elif tree.n_nodes() != len(tree.id_yield()) or len(tree.nodes()) != len(tree.full_yield()):
                    # FIXME: ignoring punctuation may lead to malformed trees
                    if ignore_punctuation:
                        continue
                    raise Exception(
                        '{4}: connected nodes: {0}, total nodes: {1}, full yield: {2}, connected yield: {3}'.format(
                            str(tree.n_nodes()), str(len(tree.nodes())), str(len(tree.full_yield())),
                            str(len(tree.id_yield())), tree.sent_label()))
                # the first `start` trees are read but not generated
                if tree_count > start:
                    yield tree
Beispiel #11
0
def parse_with_pgf(grammar, forms, poss, bin):
    """
    Parse a POS sequence with a PGF grammar and return a dependency tree.

    The escaped POS tags are parsed with the concrete grammar
    ``bin + 'grammargfconcrete'``. The first parse is rendered with
    ``graphvizParseTree`` and that output is read back into a constituent
    tree. The constituent tree is then converted into a dependency tree
    whose nodes are the sentence positions 1..n.

    :param grammar: the loaded grammar
    :type grammar: PGF
    :param forms: word forms of the sentence
    :param poss: POS tags of the sentence (same length as ``forms``)
    :param bin: prefix of the name of the concrete grammar
    :return: the dependency tree, or None if the sentence cannot be parsed
    :rtype: HybridTree
    """
    lcfrs = grammar.languages[bin + 'grammargfconcrete']

    # sentence = "ADJD ADV _COMMA_ KOUS ADV PIS PROAV VVINF VMFIN _PUNCT_"
    sentence = ' '.join(map(escape, poss))

    try:
        parses = lcfrs.parse(sentence, n=1)
        _, expr = next(parses)
    except (StopIteration, pgf.ParseError):
        return None

    dot_source = lcfrs.graphvizParseTree(expr)
    assert isinstance(dot_source, str)

    # the two line shapes of the graphviz output that are evaluated
    node_line = re.compile(r'^\s*(n\d+)\[label="([^\s]+)"\]\s*$')
    edge_line = re.compile(r'^  (n\d+) -- (n\d+)\s*$')

    tree = HybridTree()

    position = 0  # index of the next terminal in forms / poss
    for line in dot_source.splitlines():
        match = node_line.search(line)
        if match:
            node_id = match.group(1)
            label = match.group(2)
            # nodes numbered 100000 or above are treated as terminals
            if int(node_id[1:]) >= 100000:
                assert escape(poss[position]) == label
                tree.add_node(
                    node_id,
                    construct_constituent_token(form=forms[position],
                                                pos=poss[position],
                                                terminal=True), True)
                position += 1
            else:
                tree.add_node(
                    node_id,
                    construct_constituent_token(form=label,
                                                pos='_',
                                                terminal=False), False)
            if label == 'VROOT1':
                tree.add_to_root(node_id)
            continue
        match = edge_line.search(line)
        if match:
            tree.add_child(match.group(1), match.group(2))

    assert poss == [token.pos() for token in tree.token_yield()]

    dep_tree = HybridTree()
    # constituent node -> sentence position (1-based) of the terminal that
    # is attached to it
    head_table = defaultdict(lambda: None)
    # terminal -> constituent node from which its edge label was read
    attachment_point = defaultdict(lambda: None)
    for i, node in enumerate(tree.id_yield()):
        token = tree.node_token(node)
        dep_token = construct_conll_token(token.form(), un_escape(token.pos()))
        # start the search at the grandparent of the terminal
        current = tree.parent(node)
        current = tree.parent(current)
        while current:
            current_label = tree.node_token(current).category()
            # skip nodes whose label ends in <digits>X<digits>
            if not re.search(r'\d+X\d+$', current_label):
                edge_label = un_escape(current_label)
                if edge_label == 'TOP1':
                    edge_label = 'ROOT1'
                # the last character of the label is dropped
                dep_token.set_edge_label(edge_label[:-1])
                head_table[current] = i + 1
                attachment_point[node] = current
                break
            else:
                current = tree.parent(current)
        dep_tree.add_node(i + 1, dep_token, order=True)

    # attach each word to the word recorded for the closest ancestor of its
    # attachment point; words without such an ancestor become roots
    for node, dep_node in zip(tree.id_yield(), dep_tree.id_yield()):
        node = tree.parent(attachment_point[node])
        while node:
            if head_table[node]:
                dep_tree.add_child(head_table[node], dep_node)
                break
            node = tree.parent(node)
        if not node:
            dep_tree.add_to_root(dep_node)

    return dep_tree
class GeneralHybridTreeTestCase(unittest.TestCase):
    # Tests of the HybridTree API on the discontinuous example sentence
    # "Piet Marie helpen lezen ." whose final punctuation node is added with
    # the fourth add_node argument set to False and is not attached anywhere.
    tree = None

    def setUp(self):
        # (node id, form, POS tag), in sentence order
        words = (
            ("v1", "Piet", "NP"),
            ("v21", "Marie", "N"),
            ("v", "helpen", "VP"),
            ("v2", "lezen", "V"),
        )
        # (head, dependent); 'v2' is deliberately attached before 'v1'
        edges = (("v", "v2"), ("v", "v1"), ("v2", "v21"))

        self.tree = HybridTree()
        for node_id, form, tag in words:
            self.tree.add_node(node_id, construct_conll_token(form, tag), True)
        for head, dependent in edges:
            self.tree.add_child(head, dependent)
        self.tree.add_node("v3", construct_conll_token(".", "Punc"), True,
                           False)
        self.tree.add_to_root("v")

    def test_children(self):
        # before reorder() the children are in insertion order
        inserted = ['v2', 'v1']
        self.assertListEqual(self.tree.children('v'), inserted)
        self.tree.reorder()
        reordered = ['v1', 'v2']
        self.assertListEqual(self.tree.children('v'), reordered)

    def test_fringe(self):
        self.tree.reorder()
        for node, expected in (('v', [2, 0, 3, 1]), ('v2', [3, 1])):
            self.assertListEqual(self.tree.fringe(node), expected)

    def test_n_spans(self):
        self.tree.reorder()
        for node, expected in (('v', 1), ('v2', 2)):
            self.assertEqual(self.tree.n_spans(node), expected)

    def test_n_gaps(self):
        self.tree.reorder()
        gaps = self.tree.n_gaps()
        self.assertEqual(gaps, 1)

    def test_node_ids(self):
        self.tree.reorder()
        expected = sorted(['v', 'v1', 'v2', 'v21', 'v3'])
        self.assertListEqual(sorted(self.tree.nodes()), expected)

    def test_complete(self):
        self.tree.reorder()
        is_complete = self.tree.complete()
        self.assertEqual(is_complete, True)

    def test_unlabelled_structure(self):
        self.tree.reorder()
        expected = ({0, 1, 2, 3}, [({0}, []),
                                   ({1, 3}, [({1}, [])])])
        self.assertTupleEqual(self.tree.unlabelled_structure(), expected)

    def test_max_n_spans(self):
        self.tree.reorder()
        widest = self.tree.max_n_spans()
        self.assertEqual(widest, 2)

    def test_labelled_yield(self):
        # the punctuation node is not part of token_yield()
        self.tree.reorder()
        forms = [token.form() for token in self.tree.token_yield()]
        self.assertListEqual(forms, ['Piet', 'Marie', 'helpen', 'lezen'])

    def test_full_labelled_yield(self):
        # full_token_yield() also contains the punctuation node
        self.tree.reorder()
        forms = [token.form() for token in self.tree.full_token_yield()]
        self.assertListEqual(forms, ['Piet', 'Marie', 'helpen', 'lezen', '.'])

    def test_full_yield(self):
        self.tree.reorder()
        expected = ['v1', 'v21', 'v', 'v2', 'v3']
        self.assertListEqual(self.tree.full_yield(), expected)

    # def test_labelled_spans(self):
    # self.tree.reorder()
    # self.assertListEqual(self.tree.labelled_spans(), [])

    def test_pos_yield(self):
        self.tree.reorder()
        tags = [token.pos() for token in self.tree.token_yield()]
        self.assertListEqual(tags, ['NP', 'N', 'VP', 'V'])

    def test_recursive_partitioning(self):
        self.tree.reorder()
        expected = ({0, 1, 2, 3}, [({0}, []),
                                   ({1, 3}, [({1}, []), ({3}, [])]),
                                   ({2}, [])])
        self.assertEqual(self.tree.recursive_partitioning(), expected)