def test_print_subtree(self):
    """Test print_subtree() method, which uses udapi.block.write.textmodetrees."""
    from contextlib import redirect_stdout
    doc = Document()
    data_filename = os.path.join(os.path.dirname(__file__), 'data', 'enh_deps.conllu')
    doc.load_conllu(data_filename)
    root = doc.bundles[0].get_tree()
    expected1 = ("# sent_id = a-mf920901-001-p1s1A\n"
                 "# text = Slovenská ústava: pro i proti\n"
                 "─┮\n"
                 " │ ╭─╼ Slovenská ADJ amod\n"
                 " ╰─┾ ústava NOUN root\n"
                 " ┡─╼ : PUNCT punct\n"
                 " ╰─┮ pro ADP appos\n"
                 " ┡─╼ i CONJ cc\n"
                 " ╰─╼ proti ADP conj\n"
                 "\n")
    expected2 = ("─┮\n"
                 " │ ╭─╼ Slovenská Case=Nom|Degree=Pos|Gender=Fem|Negative=Pos|Number=Sing _\n"
                 " ╰─┾ ústava Case=Nom|Gender=Fem|Negative=Pos|Number=Sing SpaceAfter=No\n"
                 " ┡─╼ : _ _\n"
                 " ╰─┮ pro AdpType=Prep|Case=Acc LId=pro-1\n"
                 " ┡─╼ i _ LId=i-1\n"
                 " ╰─╼ proti AdpType=Prep|Case=Dat LId=proti-1\n"
                 "\n")
    # test non-projective tree
    root3 = Root()
    for i in range(1, 5):
        root3.create_child(form=str(i))
    nodes = root3.descendants(add_self=1)
    nodes[1].parent = nodes[3]
    nodes[4].parent = nodes[2]
    expected3 = ("─┮\n"
                 " │ ╭─╼ 1\n"
                 " ┡─╪───┮ 2\n"
                 " ╰─┶ 3 │\n"
                 " ╰─╼ 4\n"
                 "\n")
    # Use redirect_stdout with a fresh StringIO per call instead of manually
    # swapping sys.stdout: this replaces the seek/truncate dance and the
    # try/finally restore, and cannot leave sys.stdout broken if an assert fails.
    capture = io.StringIO()
    with redirect_stdout(capture):
        root.print_subtree(color=False)
    self.assertEqual(capture.getvalue(), expected1)
    capture = io.StringIO()
    with redirect_stdout(capture):
        root.print_subtree(color=False, attributes='form,feats,misc',
                           print_sent_id=False, print_text=False)
    self.assertEqual(capture.getvalue(), expected2)
    capture = io.StringIO()
    with redirect_stdout(capture):
        root3.print_subtree(color=False, attributes='form', print_sent_id=0, print_text=0)
    self.assertEqual(capture.getvalue(), expected3)
def test_deps_setter(self):
    """Appending to node.deps must be reflected in the serialized raw_deps string."""
    # Build a minimal tree: a technical root with three children.
    root = Root()
    children = [root.create_child() for _ in range(3)]
    # Add one enhanced dependency: node 1 depends on node 2 with deprel "test".
    children[0].deps.append({'parent': children[1], 'deprel': 'test'})
    self.assertEqual(children[0].raw_deps, '2:test')
def read_tree(self, document=None):
    """Read one sentence from self.filehandle and return it as a new tree (Root).

    Returns None when the filehandle is missing or when no node lines were
    read (end of file, or more than one empty line in a row).
    The `document` parameter is accepted but not used in this method.
    """
    if self.filehandle is None:
        return None
    root = Root()
    nodes = [root]   # nodes[i] will be the node with ord i (root has ord 0)
    parents = [0]    # parents[i] is the raw HEAD index of nodes[i]; linked after reading
    mwts = []        # raw field-lists of multi-word token lines, processed at the end
    for line in self.filehandle:
        line = line.rstrip()
        if line == '':  # an empty line terminates the sentence
            break
        if line[0] == '#':
            self.parse_comment_line(line, root)
        else:
            if self.separator == 'tab':
                fields = line.split('\t')
            elif self.separator == 'space':
                fields = line.split()
            elif self.separator == 'doublespace':
                # NOTE(review): the pattern ' +' splits on ANY run of spaces,
                # which makes 'doublespace' behave almost like 'space';
                # '  +' (two or more spaces) may have been intended -- confirm.
                fields = re.split(' +', line)
            else:
                raise ValueError('separator=%s is not valid' % self.separator)
            if len(fields) != len(self.node_attributes):
                if self.strict:
                    raise RuntimeError('Wrong number of columns in %r' % line)
                # Non-strict mode: pad missing trailing columns with '_'.
                fields.extend(['_'] * (len(self.node_attributes) - len(fields)))
            # multi-word tokens will be processed later
            if '-' in fields[0]:
                mwts.append(fields)
                continue
            # Empty nodes (decimal ords such as "8.1") get a dedicated factory;
            # their ord is kept as the raw string.
            if '.' in fields[0]:
                empty = root.create_empty_child(form=fields[1], lemma=fields[2], upos=fields[3],
                                                xpos=fields[4], feats=fields[5], misc=fields[9])
                empty.ord = fields[0]
                empty.raw_deps = fields[8]  # TODO
                continue
            node = root.create_child()
            # TODO slow implementation of speed-critical loading
            for (n_attribute, attribute_name) in enumerate(self.node_attributes):
                if attribute_name == 'head':
                    try:
                        parents.append(int(fields[n_attribute]))
                    except ValueError as exception:
                        if not self.strict and fields[n_attribute] == '_':
                            if self.empty_parent == 'warn':
                                logging.warning(
                                    "Empty parent/head index in '%s'", line)
                            # Fall back to attaching under the technical root.
                            parents.append(0)
                        else:
                            raise exception
                elif attribute_name == 'ord':
                    setattr(node, 'ord', int(fields[n_attribute]))
                elif attribute_name == 'deps':
                    setattr(node, 'raw_deps', fields[n_attribute])
                elif attribute_name != '_':
                    setattr(node, attribute_name, fields[n_attribute])
            nodes.append(node)
    # If no nodes were read from the filehandle (so only root remained in nodes),
    # we return None as a sign of failure (end of file or more than one empty line).
    if len(nodes) == 1:
        return None
    # Empty sentences are not allowed in CoNLL-U,
    # but if the users want to save just the sentence string and/or sent_id
    # they need to create one artificial node and mark it with Empty=Yes.
    # In that case, we will delete this node, so the tree will have just the (technical) root.
    # See also udapi.block.write.Conllu, which is compatible with this trick.
    if len(nodes) == 2 and nodes[1].misc == 'Empty=Yes':
        nodes.pop()
    # Set dependency parents (now, all nodes of the tree are created).
    # TODO: parent setter checks for cycles, but this is something like O(n*log n)
    # if done for each node. It could be done faster if the whole tree is checked at once.
    # Also parent setter removes the node from its old parent's list of children,
    # this could be skipped here by not using `node = root.create_child()`.
    for node_ord, node in enumerate(nodes[1:], 1):
        try:
            node.parent = nodes[parents[node_ord]]
        except IndexError:
            raise ValueError("Node %s HEAD is out of range (%d)" % (node, parents[node_ord]))
    # Create multi-word tokens.
    for fields in mwts:
        range_start, range_end = fields[0].split('-')
        words = nodes[int(range_start):int(range_end) + 1]
        root.create_multiword_token(words, form=fields[1], misc=fields[-1])
    return root
def process_document(self, document):
    """Align predicted trees with the gold sentence segmentation of `document`.

    For each bundle, one predicted tree is taken from extract_pred_trees() and
    added to the bundle. When predicted segmentation differs from gold, nodes
    are moved between predicted trees (steal_nodes) and a token form may even
    be split, so that each bundle's predicted tree covers exactly the same
    characters (ignoring ASCII spaces) as its gold tree.
    """
    if not document.bundles:
        return
    # NOTE(review): pred_trees is consumed with pop() from the END of the list;
    # presumably extract_pred_trees returns the trees in reverse order -- confirm.
    pred_trees = self.extract_pred_trees(document)
    # Remember which nodes were children of a root in the original predicted trees.
    was_subroot = set()
    for pred_tree in pred_trees:
        for n in pred_tree.children:
            was_subroot.add(n)
    for bundle_no, bundle in enumerate(document.bundles):
        g_tree = bundle.trees[0]
        p_tree = pred_trees.pop()
        # Sentences are compared via their space-less character strings.
        g_chars = ''.join(t.form for t in g_tree.token_descendants).replace(' ', '')
        p_chars = ''.join(t.form for t in p_tree.token_descendants).replace(' ', '')
        if g_chars == p_chars:
            bundle.add_tree(p_tree)
            continue
        # Make sure that p_tree contains enough nodes.
        # NOTE(review): moved_roots is collected but never read in this method -- confirm
        # whether it is dead code or used via some side effect elsewhere.
        moved_roots = []
        while len(p_chars) < len(g_chars):
            if not pred_trees:
                raise ValueError('no pred_trees:\n%s\n%s' % (p_chars, g_chars))
            new_p_tree = pred_trees.pop()
            p_chars += ''.join(t.form for t in new_p_tree.token_descendants).replace(' ', '')
            moved_roots.extend(new_p_tree.children)
            p_tree.steal_nodes(new_p_tree.descendants)
        self.choose_root(p_tree, was_subroot, g_tree)
        if not p_chars.startswith(g_chars):
            raise ValueError('sent_id=%s: !p_chars.startswith(g_chars):\np_chars=%s\ng_chars=%s'
                             % (g_tree.sent_id, p_chars, g_chars))
        if g_chars == p_chars:
            bundle.add_tree(p_tree)
            continue
        # Now p_tree contains more nodes than it should.
        # Walk the tokens and find where the gold sentence boundary falls.
        p_chars = ''
        tokens = p_tree.token_descendants
        for index, token in enumerate(tokens):
            p_chars += token.form.replace(' ', '')
            if len(p_chars) > len(g_chars):
                # A single predicted token spans the gold sentence boundary.
                logging.warning('Pred token crossing gold sentences: %s', g_tree.sent_id)
                # E.g. gold cs ln95048-151-p2s8 contains SpaceAfter=No on the last word
                # of the sentence, resulting in "uklidnila.Komentář" in the raw text.
                # It is not obvious how to fix this "properly", i.e. without increasing
                # or decreasing the resulting LAS. The current solution is quite hacky.
                if index + 1 == len(tokens):
                    # The crossing token is the last one: push the overflow characters
                    # into a fresh one-node tree for the next bundle.
                    next_p_tree = Root(zone=p_tree.zone)
                    pred_trees.append(next_p_tree)
                    next_p_tree.create_child(deprel='wrong', form=p_chars[len(g_chars):],
                                             misc='Rehanged=Yes')
                    bundle.add_tree(p_tree)
                    break
                else:
                    # Prepend the overflow characters to the following token,
                    # so the boundary now falls exactly after the current token.
                    next_tok = tokens[index + 1]
                    next_tok.form = p_chars[len(g_chars):] + next_tok.form
                    p_chars = g_chars
            if len(p_chars) == len(g_chars):
                # Boundary reached: move all remaining tokens into a new tree
                # that will serve as the prediction for the NEXT bundle.
                next_p_tree = Root(zone=p_tree.zone)
                words = []
                for token in tokens[index + 1:]:
                    if isinstance(token, MWT):
                        words.extend(token.words)
                    else:
                        words.append(token)
                for word in words:
                    if word in was_subroot:
                        # NOTE(review): presumably choose_root marked former subroots
                        # with Rehanged=Yes earlier; this undoes that mark -- confirm.
                        del word.misc['Rehanged']
                    # Mark words whose functional relations get severed by the split.
                    if word.parent is not p_tree and word.parent not in words:
                        if word.udeprel in FUNCTIONAL:
                            word.parent.misc['FuncChildMissing'] = 'Yes'
                    for child in word.children:
                        if child not in words and child.udeprel in FUNCTIONAL:
                            word.misc['FuncChildMissing'] = 'Yes'
                next_p_tree.steal_nodes(words)
                self.choose_root(p_tree, was_subroot, g_tree)
                self.choose_root(next_p_tree, was_subroot,
                                 document.bundles[bundle_no + 1].trees[0])
                pred_trees.append(next_p_tree)
                bundle.add_tree(p_tree)
                break
def test_print_subtree(self):
    """Test print_subtree() method, which uses udapi.block.write.textmodetrees."""
    doc = Document()
    conllu_path = os.path.join(os.path.dirname(__file__), 'data', 'enh_deps.conllu')
    doc.load_conllu(conllu_path)
    tree = doc.bundles[0].get_tree()
    expected1 = ("# sent_id = a-mf920901-001-p1s1A\n"
                 "# text = Slovenská ústava: pro i proti\n"
                 "─┮\n"
                 " │ ╭─╼ Slovenská ADJ amod\n"
                 " ╰─┾ ústava NOUN root\n"
                 " ┡─╼ : PUNCT punct\n"
                 " ╰─┮ pro ADP appos\n"
                 " ┡─╼ i CONJ cc\n"
                 " ╰─╼ proti ADP conj\n"
                 "\n")
    expected2 = ("─┮\n"
                 " │ ╭─╼ Slovenská Case=Nom|Degree=Pos|Gender=Fem|Negative=Pos|Number=Sing _\n"
                 " ╰─┾ ústava Case=Nom|Gender=Fem|Negative=Pos|Number=Sing SpaceAfter=No\n"
                 " ┡─╼ : _ _\n"
                 " ╰─┮ pro AdpType=Prep|Case=Acc LId=pro-1\n"
                 " ┡─╼ i _ LId=i-1\n"
                 " ╰─╼ proti AdpType=Prep|Case=Dat LId=proti-1\n"
                 "\n")
    # Build a small non-projective tree by hand: 1 under 3, 4 under 2.
    nonproj = Root()
    for form in ('1', '2', '3', '4'):
        nonproj.create_child(form=form)
    all_nodes = nonproj.descendants(add_self=1)
    all_nodes[1].parent = all_nodes[3]
    all_nodes[4].parent = all_nodes[2]
    expected3 = ("─┮\n"
                 " │ ╭─╼ 1\n"
                 " ┡─╪───┮ 2\n"
                 " ╰─┶ 3 │\n"
                 " ╰─╼ 4\n"
                 "\n")
    buf = io.StringIO()
    try:
        # Capture everything print_subtree writes to stdout.
        sys.stdout = buf
        tree.print_subtree(color=False)
        self.assertEqual(buf.getvalue(), expected1)
        buf.seek(0)
        buf.truncate()
        tree.print_subtree(color=False, attributes='form,feats,misc',
                           print_sent_id=False, print_text=False)
        self.assertEqual(buf.getvalue(), expected2)
        buf.seek(0)
        buf.truncate()
        nonproj.print_subtree(color=False, attributes='form', print_sent_id=0, print_text=0)
        self.assertEqual(buf.getvalue(), expected3)
    finally:
        sys.stdout = sys.__stdout__  # pylint: disable=redefined-variable-type
def tokenize_tag_parse_tree(self, root, resegment=False, tag=True, parse=True):
    """Tokenize, tag (+lemmatize, fill FEATS) and parse the text stored in `root.text`.

    If resegment=True, the returned list of Udapi trees may contain multiple trees.

    Raises ValueError if `root` already has children, or if parse=True with tag=False
    (the UDPipe parser needs the tagger's output).
    """
    if root.children:
        raise ValueError(
            'Tree already contained nodes before tokenization')
    # Fail fast on the invalid flag combination, before doing any tokenization work.
    if parse and not tag:
        raise ValueError(
            'Combination parse=True tag=False is not allowed.')

    # Tokenize and segment the text (segmentation cannot be turned off in older UDPipe versions).
    self.tokenizer.setText(root.text)
    u_sentences = []
    u_sentence = Sentence()
    while self.tokenizer.nextSentence(u_sentence):
        u_sentences.append(u_sentence)
        u_sentence = Sentence()

    # If resegmentation was not required, we need to join the segments,
    # renumbering the word ids so they stay consecutive within the merged sentence.
    if not resegment and len(u_sentences) > 1:
        first_sent = u_sentences[0]
        n_words = first_sent.words.size() - 1
        for other_sent in u_sentences[1:]:
            for i in range(1, other_sent.words.size()):
                u_w = other_sent.words[i]
                n_words += 1
                u_w.id = n_words
                first_sent.words.append(u_w)
        u_sentences = [first_sent]

    # Tagging and (optionally) parsing, sentence by sentence.
    if tag:
        for u_sentence in u_sentences:
            self.tool.tag(u_sentence, Model.DEFAULT)
            if parse:
                self.tool.parse(u_sentence, Model.DEFAULT)

    # Convert UDPipe nodes to Udapi nodes. The first sentence reuses `root`;
    # any further sentences (resegment=True) get a fresh Root each.
    new_root = root
    trees = []
    for u_sentence in u_sentences:
        if not new_root:
            new_root = Root()
        heads, nodes = [], [new_root]
        u_words = u_sentence.words
        for i in range(1, u_words.size()):
            u_w = u_words[i]
            node = new_root.create_child(
                form=u_w.form, lemma=u_w.lemma, upos=u_w.upostag,
                xpos=u_w.xpostag, feats=u_w.feats, deprel=u_w.deprel, misc=u_w.misc,
            )
            if parse:
                heads.append(u_w.head)
            nodes.append(node)
        if parse:
            # zip instead of heads.pop(0): pop(0) is O(n) per call,
            # making the original loop quadratic in sentence length.
            for node, head in zip(nodes[1:], heads):
                node.parent = nodes[head]
        trees.append(new_root)
        new_root = None
    return trees
def process_document(self, document):
    """Align predicted trees with the gold sentence segmentation of `document`.

    For each bundle, one predicted tree is taken from extract_pred_trees() and
    added to the bundle. When predicted segmentation differs from gold, nodes
    are moved between predicted trees (steal_nodes) and a token form may even
    be split, so that each bundle's predicted tree covers exactly the same
    characters (after space stripping) as its gold tree.
    """
    if not document.bundles:
        return
    # NOTE(review): pred_trees is consumed with pop() from the END of the list;
    # presumably extract_pred_trees returns the trees in reverse order -- confirm.
    pred_trees = self.extract_pred_trees(document)
    # Remember which nodes were children of a root in the original predicted trees.
    was_subroot = set()
    for pred_tree in pred_trees:
        for n in pred_tree.children:
            was_subroot.add(n)
    for bundle_no, bundle in enumerate(document.bundles):
        g_tree = bundle.trees[0]
        p_tree = pred_trees.pop()
        # Sentences are compared via their space-stripped character strings.
        g_chars = self._strip_spaces(''.join(t.form for t in g_tree.token_descendants))
        p_chars = self._strip_spaces(''.join(t.form for t in p_tree.token_descendants))
        # Additionally drop all Unicode space separators (category Zs).
        # NOTE(review): this Zs filtering is applied only HERE; characters appended
        # later (the while loop and the token loop below) go through _strip_spaces
        # only. Confirm _strip_spaces already removes all Zs characters, otherwise
        # the startswith/length comparisons below can misalign.
        g_chars = ''.join(filter(lambda c: unicodedata.category(c) != "Zs", g_chars))
        p_chars = ''.join(filter(lambda c: unicodedata.category(c) != "Zs", p_chars))
        if g_chars == p_chars:
            bundle.add_tree(p_tree)
            continue
        # Make sure that p_tree contains enough nodes.
        # NOTE(review): moved_roots is collected but never read in this method -- confirm
        # whether it is dead code or used via some side effect elsewhere.
        moved_roots = []
        while len(p_chars) < len(g_chars):
            if not pred_trees:
                raise ValueError('no pred_trees:\n%s\n%s' % (p_chars, g_chars))
            new_p_tree = pred_trees.pop()
            p_chars += self._strip_spaces(''.join(t.form for t in new_p_tree.token_descendants))
            moved_roots.extend(new_p_tree.children)
            p_tree.steal_nodes(new_p_tree.descendants)
        self.choose_root(p_tree, was_subroot, g_tree)
        if not p_chars.startswith(g_chars):
            raise ValueError('sent_id=%s: !p_chars.startswith(g_chars):\np_chars=%s\ng_chars=%s'
                             % (g_tree.sent_id, p_chars, g_chars))
        if g_chars == p_chars:
            bundle.add_tree(p_tree)
            continue
        # Now p_tree contains more nodes than it should.
        # Walk the tokens and find where the gold sentence boundary falls.
        p_chars = ''
        tokens = p_tree.token_descendants
        for index, token in enumerate(tokens):
            p_chars += self._strip_spaces(token.form)
            if len(p_chars) > len(g_chars):
                # A single predicted token spans the gold sentence boundary.
                logging.warning('Pred token crossing gold sentences: %s', g_tree.sent_id)
                # E.g. gold cs ln95048-151-p2s8 contains SpaceAfter=No on the last word
                # of the sentence, resulting in "uklidnila.Komentář" in the raw text.
                # It is not obvious how to fix this "properly", i.e. without increasing
                # or decreasing the resulting LAS. The current solution is quite hacky.
                if index + 1 == len(tokens):
                    # The crossing token is the last one: push the overflow characters
                    # into a fresh one-node tree for the next bundle.
                    next_p_tree = Root(zone=p_tree.zone)
                    pred_trees.append(next_p_tree)
                    next_p_tree.create_child(deprel='wrong', form=p_chars[len(g_chars):],
                                             misc='Rehanged=Yes')
                    bundle.add_tree(p_tree)
                    break
                else:
                    # Prepend the overflow characters to the following token,
                    # so the boundary now falls exactly after the current token.
                    next_tok = tokens[index + 1]
                    next_tok.form = p_chars[len(g_chars):] + next_tok.form
                    p_chars = g_chars
            if len(p_chars) == len(g_chars):
                # Boundary reached: move all remaining tokens into a new tree
                # that will serve as the prediction for the NEXT bundle.
                next_p_tree = Root(zone=p_tree.zone)
                words = []
                for token in tokens[index + 1:]:
                    if isinstance(token, MWT):
                        words.extend(token.words)
                    else:
                        words.append(token)
                for word in words:
                    if word in was_subroot:
                        # NOTE(review): presumably choose_root marked former subroots
                        # with Rehanged=Yes earlier; this undoes that mark -- confirm.
                        del word.misc['Rehanged']
                    # Mark words whose functional relations get severed by the split.
                    if word.parent is not p_tree and word.parent not in words:
                        if word.udeprel in FUNCTIONAL:
                            word.parent.misc['FuncChildMissing'] = 'Yes'
                    for child in word.children:
                        if child not in words and child.udeprel in FUNCTIONAL:
                            word.misc['FuncChildMissing'] = 'Yes'
                next_p_tree.steal_nodes(words)
                self.choose_root(p_tree, was_subroot, g_tree)
                self.choose_root(next_p_tree, was_subroot,
                                 document.bundles[bundle_no + 1].trees[0])
                pred_trees.append(next_p_tree)
                bundle.add_tree(p_tree)
                break