Beispiel #1
0
    def test_print_subtree(self):
        """Test print_subtree() method, which uses udapi.block.write.textmodetrees.

        Three renderings are compared against hard-coded expected text:
        the tree loaded from data/enh_deps.conllu with default settings,
        the same tree with attributes 'form,feats,misc' and without the
        sent_id/text header lines, and a hand-built non-projective tree.
        """
        doc = Document()
        data_filename = os.path.join(os.path.dirname(__file__), 'data', 'enh_deps.conllu')
        doc.load_conllu(data_filename)
        root = doc.bundles[0].get_tree()

        expected1 = ("# sent_id = a-mf920901-001-p1s1A\n"
                     "# text = Slovenská ústava: pro i proti\n"
                     "─┮\n"
                     " │ ╭─╼ Slovenská ADJ amod\n"
                     " ╰─┾ ústava NOUN root\n"
                     "   ┡─╼ : PUNCT punct\n"
                     "   ╰─┮ pro ADP appos\n"
                     "     ┡─╼ i CONJ cc\n"
                     "     ╰─╼ proti ADP conj\n"
                     "\n")
        expected2 = ("─┮\n"
                     " │ ╭─╼ Slovenská Case=Nom|Degree=Pos|Gender=Fem|Negative=Pos|Number=Sing _\n"
                     " ╰─┾ ústava Case=Nom|Gender=Fem|Negative=Pos|Number=Sing SpaceAfter=No\n"
                     "   ┡─╼ : _ _\n"
                     "   ╰─┮ pro AdpType=Prep|Case=Acc LId=pro-1\n"
                     "     ┡─╼ i _ LId=i-1\n"
                     "     ╰─╼ proti AdpType=Prep|Case=Dat LId=proti-1\n"
                     "\n")

        # test non-projective tree
        root3 = Root()
        for i in range(1, 5):
            root3.create_child(form=str(i))
        # With add_self=1 index 0 is the root itself, so nodes[k] is the child
        # created with form str(k).
        nodes = root3.descendants(add_self=1)
        nodes[1].parent = nodes[3]
        nodes[4].parent = nodes[2]
        expected3 = ("─┮\n"
                     " │ ╭─╼ 1\n"
                     " ┡─╪───┮ 2\n"
                     " ╰─┶ 3 │\n"
                     "       ╰─╼ 4\n"
                     "\n")

        # Remember the stream that is installed right now: a test runner may
        # already have replaced sys.stdout, and resetting to sys.__stdout__
        # would silently undo that redirection for all later tests.
        previous_stdout = sys.stdout
        capture = io.StringIO()
        sys.stdout = capture
        try:
            root.print_subtree(color=False)
            self.assertEqual(capture.getvalue(), expected1)
            # Empty the buffer so the next comparison sees only new output.
            capture.seek(0)
            capture.truncate()
            root.print_subtree(color=False, attributes='form,feats,misc',
                               print_sent_id=False, print_text=False)
            self.assertEqual(capture.getvalue(), expected2)
            capture.seek(0)
            capture.truncate()
            root3.print_subtree(color=False, attributes='form', print_sent_id=0, print_text=0)
            self.assertEqual(capture.getvalue(), expected3)
        finally:
            sys.stdout = previous_stdout
Beispiel #2
0
def load():
    """Benchmark basic udapi operations on 'cs-ud-train-l.conllu'.

    Runs 30 rounds. Each round times, separately: loading the CoNLL-U file,
    reading form and lemma of every node, filtering nodes by their own and
    their parent's deprel, overwriting every deprel, calling compute_text()
    on every tree, and storing the document to 'hello.conllu'.
    Finally prints mean and standard deviation (rounded to two decimals)
    for each phase.
    """
    from udapi.core.document import Document

    # Fixed reporting order of the measured phases.
    phases = ('load', 'read', 'write', 'text', 'relchain', 'save')
    timings = {phase: [] for phase in phases}

    for _ in range(30):
        began = timeit.default_timer()
        document = Document()
        document.load_conllu('cs-ud-train-l.conllu')
        finished = timeit.default_timer()
        timings['load'].append(finished - began)

        # Attribute reads; the concatenated value is deliberately discarded.
        began = timeit.default_timer()
        for bundle in document:
            for tree in bundle:
                for token in tree.descendants:
                    joined = token.form + token.lemma
        finished = timeit.default_timer()
        timings['read'].append(finished - began)

        # Parent-relation lookup; the resulting list is deliberately discarded.
        began = timeit.default_timer()
        for bundle in document:
            for tree in bundle:
                matches = [n for n in tree.descendants
                           if n.deprel == "case" and n.parent.deprel == "nmod"]
        finished = timeit.default_timer()
        timings['relchain'].append(finished - began)

        began = timeit.default_timer()
        for bundle in document:
            for tree in bundle:
                for token in tree.descendants:
                    token.deprel = 'dep'
        finished = timeit.default_timer()
        timings['write'].append(finished - began)

        began = timeit.default_timer()
        for bundle in document:
            for tree in bundle:
                tree.compute_text()
        finished = timeit.default_timer()
        timings['text'].append(finished - began)

        began = timeit.default_timer()
        document.store_conllu('hello.conllu')
        finished = timeit.default_timer()
        timings['save'].append(finished - began)

    for phase in phases:
        samples = timings[phase]
        mean = round(np.mean(samples), 2)
        spread = round(np.std(samples), 2)
        print("{}\t{} +/- {}".format(phase, mean, spread))
Beispiel #3
0
    def test_topology(self):
        """Test methods/properties descendants, children, prev_node, next_node, ord.

        Also covers find_minimal_common_treelet(), precedes() and the
        comparison operators between nodes, and shift_after_node(), all on
        the single tree loaded from data/enh_deps.conllu.
        """
        doc = Document()
        data_filename = os.path.join(os.path.dirname(__file__), 'data',
                                     'enh_deps.conllu')
        doc.load_conllu(data_filename)
        self.assertEqual(len(doc.bundles), 1)
        root = doc.bundles[0].get_tree()
        nodes = root.descendants
        nodes2 = root.descendants()
        # descendants() and descendants should return the same sequence of nodes
        self.assertEqual(nodes, nodes2)
        self.assertEqual(len(nodes), 6)
        self.assertEqual(nodes[1].parent, root)
        self.assertEqual(nodes[2].root, root)
        self.assertEqual(len(nodes[1].descendants), 5)
        self.assertEqual(len(nodes[1].children), 3)
        self.assertEqual(len(nodes[1].children(add_self=True)), 4)
        self.assertEqual(len(nodes[1].children(add_self=1, following_only=1)),
                         3)

        self.assertEqual(nodes[0].next_node, nodes[1])
        self.assertEqual(nodes[2].prev_node, nodes[1])
        # Identity checks: the ends of the sequence must yield None itself,
        # not merely something that compares equal to None.
        self.assertIsNone(nodes[5].next_node)
        self.assertIsNone(root.prev_node)

        (common_ancestor,
         added_nodes) = find_minimal_common_treelet(nodes[0], nodes[1])
        self.assertEqual(common_ancestor, nodes[1])
        self.assertEqual(list(added_nodes), [])
        input_nodes = [nodes[2], nodes[4], nodes[5]]
        (common_ancestor,
         added_nodes) = find_minimal_common_treelet(*input_nodes)
        self.assertEqual(common_ancestor, nodes[1])
        self.assertEqual(list(added_nodes), [nodes[1], nodes[3]])

        # ords and reorderings
        self.assertEqual([node.ord for node in nodes], [1, 2, 3, 4, 5, 6])
        self.assertTrue(nodes[0].precedes(nodes[1]))
        self.assertLess(nodes[0], nodes[1])
        # Kept as assertFalse so that the ">" operator itself is exercised.
        self.assertFalse(nodes[0] > nodes[1])
        self.assertLessEqual(nodes[0], nodes[0])
        nodes[0].shift_after_node(nodes[1])
        # "nodes" was fetched before the shift and keeps its old element
        # order, so the ords of its first two entries now appear swapped;
        # a fresh descendants() call is sorted by ord again.
        self.assertEqual([node.ord for node in nodes], [2, 1, 3, 4, 5, 6])
        self.assertEqual([node.ord for node in root.descendants()],
                         [1, 2, 3, 4, 5, 6])
Beispiel #4
0
def extract_senseid_children_collocates(conllu_filename):
    """Group bundles by the child collocates of sense-annotated nodes.

    Loads ``conllu_filename`` and walks each bundle's nodes, starting from
    ``bundle.get_tree()`` and following ``next_node``. For every node whose
    ``misc['senseid']`` is truthy, each of its children contributes the
    current bundle under the key path
    ``node.form -> senseid -> child.deprel -> child.form``.

    ``setattr_words(bundle=bundle)`` is called on every bundle first; that
    helper is defined elsewhere (presumably it attaches the bundle's word
    list -- confirm against its definition).

    Returns:
        A plain ``dict`` of ``dict`` of ``dict`` whose innermost values are
        ``OrderedDict`` objects mapping child form to the list of bundles,
        ordered by descending list length.
    """
    document = Document()
    document.load_conllu(conllu_filename)
    collected = Vividict()

    for bundle in document.bundles:
        setattr_words(bundle=bundle)
        current = bundle.get_tree()
        while current:
            target = current.form
            sense = current.misc['senseid']
            if sense:
                # E.g. for a verb like 想: record every child of the sense node.
                for child in current.children:
                    # Looked up inside the loop on purpose: a sense node with
                    # no children must not create any entry.
                    by_form = collected[target][sense][child.deprel]
                    if by_form[child.form] == {}:
                        # First sighting: the lookup yielded an empty mapping.
                        by_form[child.form] = [bundle]
                    else:
                        by_form[child.form].append(bundle)
            current = current.next_node

    def most_frequent_first(form_bundles):
        """Order child forms by how many bundles they occur in, largest first."""
        ranked = sorted(form_bundles.items(),
                        key=lambda item: len(item[1]),
                        reverse=True)
        return OrderedDict(ranked)

    # Convert the nested structure back into ordinary dictionaries.
    return {
        target: {
            sense: {
                deprel: most_frequent_first(form_bundles)
                for deprel, form_bundles in by_deprel.items()
            }
            for sense, by_deprel in by_sense.items()
        }
        for target, by_sense in collected.items()
    }
Beispiel #5
0
    def test_topology(self):
        """Test methods/properties descendants, children, prev_node, next_node, ord.

        Also covers find_minimal_common_treelet() and shift_after_node(),
        all on the single tree loaded from data/enh_deps.conllu.
        """
        doc = Document()
        data_filename = os.path.join(os.path.dirname(__file__), 'data', 'enh_deps.conllu')
        doc.load_conllu(data_filename)
        self.assertEqual(len(doc.bundles), 1)
        root = doc.bundles[0].get_tree()
        nodes = root.descendants
        nodes2 = root.descendants()
        # descendants() and descendants should return the same sequence of nodes
        self.assertEqual(nodes, nodes2)
        self.assertEqual(len(nodes), 6)
        self.assertEqual(nodes[1].parent, root)
        self.assertEqual(nodes[2].root, root)
        self.assertEqual(len(nodes[1].descendants), 5)
        self.assertEqual(len(nodes[1].children), 3)
        self.assertEqual(len(nodes[1].children(add_self=True)), 4)
        self.assertEqual(len(nodes[1].children(add_self=1, following_only=1)), 3)

        self.assertEqual(nodes[0].next_node, nodes[1])
        self.assertEqual(nodes[2].prev_node, nodes[1])
        # Identity checks: the ends of the sequence must yield None itself,
        # not merely something that compares equal to None.
        self.assertIsNone(nodes[5].next_node)
        self.assertIsNone(root.prev_node)

        (common_ancestor, added_nodes) = find_minimal_common_treelet(nodes[0], nodes[1])
        self.assertEqual(common_ancestor, nodes[1])
        self.assertEqual(list(added_nodes), [])
        input_nodes = [nodes[2], nodes[4], nodes[5]]
        (common_ancestor, added_nodes) = find_minimal_common_treelet(*input_nodes)
        self.assertEqual(common_ancestor, nodes[1])
        self.assertEqual(list(added_nodes), [nodes[1], nodes[3]])

        # ords and reorderings
        self.assertEqual([node.ord for node in nodes], [1, 2, 3, 4, 5, 6])
        nodes[0].shift_after_node(nodes[1])
        # "nodes" was fetched before the shift and keeps its old element
        # order, so the ords of its first two entries now appear swapped;
        # a fresh descendants() call is sorted by ord again.
        self.assertEqual([node.ord for node in nodes], [2, 1, 3, 4, 5, 6])
        self.assertEqual([node.ord for node in root.descendants()], [1, 2, 3, 4, 5, 6])
Beispiel #6
0
def load():
    """Run a fixed udapi workload on 'cs-ud-train-l.conllu'.

    The document is loaded once, then four passes over all trees are made:
    reading form and lemma of every node, filtering nodes by the deprel of
    their parent and grandparent, overwriting every deprel with 'dep', and
    calling compute_text() on every tree. The result is stored to
    'hello.conllu'. Nothing is returned.
    """
    from udapi.core.document import Document

    document = Document()
    document.load_conllu('cs-ud-train-l.conllu')

    def trees():
        """Yield every tree of every bundle, in document order."""
        for bundle in document:
            yield from bundle

    # Pass 1: attribute reads; the concatenation is deliberately discarded.
    for tree in trees():
        for token in tree.descendants:
            joined = token.form + token.lemma

    # Pass 2: two-step parent lookup; the list is deliberately discarded.
    for tree in trees():
        matches = [n for n in tree.descendants
                   if n.parent.deprel == "det" and n.parent.parent.deprel == "obj"]

    # Pass 3: attribute writes.
    for tree in trees():
        for token in tree.descendants:
            token.deprel = 'dep'

    # Pass 4: sentence text computation.
    for tree in trees():
        tree.compute_text()

    document.store_conllu('hello.conllu')
Beispiel #7
0
from udapi.core.document import Document

# Load the sense-annotated corpus.
D = Document()
D.load_conllu(
    'SemEval-2007/Chinese_train_pos.xml.utf8.sentences.conllu.senseid')

# For every bundle, store the forms of its nodes on the bundle itself
# (attribute "words") and print them next to the bundle id.
for bundle in D.bundles:
    forms = bundle.words = []
    # Walk from the node returned by get_tree() along next_node until it
    # yields a falsy value.
    current = bundle.get_tree()
    while current:
        forms.append(current.form)
        current = current.next_node
    print(bundle.bundle_id, bundle.words)
Beispiel #8
0
    def test_print_subtree(self):
        """Test print_subtree() method, which uses udapi.block.write.textmodetrees.

        Three renderings are compared against hard-coded expected text:
        the tree loaded from data/enh_deps.conllu with default settings,
        the same tree with attributes 'form,feats,misc' and without the
        sent_id/text header lines, and a hand-built non-projective tree.
        """
        doc = Document()
        data_filename = os.path.join(os.path.dirname(__file__), 'data',
                                     'enh_deps.conllu')
        doc.load_conllu(data_filename)
        root = doc.bundles[0].get_tree()

        expected1 = ("# sent_id = a-mf920901-001-p1s1A\n"
                     "# text = Slovenská ústava: pro i proti\n"
                     "─┮\n"
                     " │ ╭─╼ Slovenská ADJ amod\n"
                     " ╰─┾ ústava NOUN root\n"
                     "   ┡─╼ : PUNCT punct\n"
                     "   ╰─┮ pro ADP appos\n"
                     "     ┡─╼ i CONJ cc\n"
                     "     ╰─╼ proti ADP conj\n"
                     "\n")
        expected2 = (
            "─┮\n"
            " │ ╭─╼ Slovenská Case=Nom|Degree=Pos|Gender=Fem|Negative=Pos|Number=Sing _\n"
            " ╰─┾ ústava Case=Nom|Gender=Fem|Negative=Pos|Number=Sing SpaceAfter=No\n"
            "   ┡─╼ : _ _\n"
            "   ╰─┮ pro AdpType=Prep|Case=Acc LId=pro-1\n"
            "     ┡─╼ i _ LId=i-1\n"
            "     ╰─╼ proti AdpType=Prep|Case=Dat LId=proti-1\n"
            "\n")

        # test non-projective tree
        root3 = Root()
        for i in range(1, 5):
            root3.create_child(form=str(i))
        # With add_self=1 index 0 is the root itself, so nodes[k] is the child
        # created with form str(k).
        nodes = root3.descendants(add_self=1)
        nodes[1].parent = nodes[3]
        nodes[4].parent = nodes[2]
        expected3 = ("─┮\n"
                     " │ ╭─╼ 1\n"
                     " ┡─╪───┮ 2\n"
                     " ╰─┶ 3 │\n"
                     "       ╰─╼ 4\n"
                     "\n")

        # Remember the stream that is installed right now: a test runner may
        # already have replaced sys.stdout, and resetting to sys.__stdout__
        # would silently undo that redirection for all later tests.
        previous_stdout = sys.stdout
        capture = io.StringIO()
        sys.stdout = capture
        try:
            root.print_subtree(color=False)
            self.assertEqual(capture.getvalue(), expected1)
            # Empty the buffer so the next comparison sees only new output.
            capture.seek(0)
            capture.truncate()
            root.print_subtree(color=False,
                               attributes='form,feats,misc',
                               print_sent_id=False,
                               print_text=False)
            self.assertEqual(capture.getvalue(), expected2)
            capture.seek(0)
            capture.truncate()
            root3.print_subtree(color=False,
                                attributes='form',
                                print_sent_id=0,
                                print_text=0)
            self.assertEqual(capture.getvalue(), expected3)
        finally:
            sys.stdout = previous_stdout