def read_tree(self, document=None):
    """Read a single line from the filehandle and wrap it in a new tree root.

    The line (with trailing whitespace removed) is stored in the ``text``
    attribute of the returned root.  ``None`` is returned when no filehandle
    is set or when ``readline()`` yields an empty string.
    """
    handle = self.filehandle
    if handle is None:
        return None
    raw_line = handle.readline()
    # readline() yields '' only when nothing more can be read.
    if raw_line == '':
        return None
    tree = Root()
    tree.text = raw_line.rstrip()
    return tree
Exemple #2
0
    def test_deps_setter(self):
        """Check that an entry appended to ``deps`` shows up in ``raw_deps``."""
        # Build a small tree: a root with three children.
        tree = Root()
        for _ in range(3):
            tree.create_child()

        descendants = tree.descendants()
        first, second = descendants[0], descendants[1]
        first.deps.append({'parent': second, 'deprel': 'test'})

        # The second node is referenced as "2" in the serialized form.
        self.assertEqual(first.raw_deps, '2:test')
Exemple #3
0
    def test_deps_setter(self):
        """Check that an entry appended to ``deps`` shows up in ``raw_deps``."""
        # Build a small tree: a root with three children.
        tree = Root()
        for _ in range(3):
            tree.create_child()

        descendants = tree.descendants()
        first, second = descendants[0], descendants[1]
        first.deps.append({'parent': second, 'deprel': 'test'})

        # The second node is referenced as "2" in the serialized form.
        self.assertEqual(first.raw_deps, '2:test')
Exemple #4
0
    def read_tree(self, document=None):
        """Read one sentence from the filehandle and return the root of its tree.

        Lines are consumed until the first empty line or the end of the input.
        Lines starting with ``#`` are skipped.  A line that starts with
        whitespace is handed (without its leading whitespace) to
        ``self._node``, which returns the new node and the ord of its parent.
        Any other line starts a new token whose form is ``line[2:-2]``.

        Returns:
            The tree root, or ``None`` if there is no filehandle or no node
            was created.

        Raises:
            ValueError: if a parent ord does not index an existing node.
        """
        if self.filehandle is None:
            return None

        root = Root()
        parents = [0]
        words = []
        form = None
        for line in self.filehandle:
            line = line.rstrip()
            if line == '':
                break
            if line[0] == '#':
                # Are comments allowed in VISL-cg?
                continue

            if line[0].isspace():
                # str.lstrip() returns a new string; the stripped result has to
                # be passed on explicitly (a bare `line.lstrip(line)` is a no-op).
                node, parent_ord = self._node(line.lstrip(), root)
                words.append(node)
                parents.append(parent_ord)
            else:
                # A new token line: finish the words collected for the previous token.
                if words:
                    words[0].form = form
                    if len(words) > 1:
                        split_forms = form.split()
                        if len(words) == len(split_forms):
                            for word, split_form in zip(words, split_forms):
                                word.form = split_form
                        else:
                            for word in words[1:]:
                                word.form = '_'
                        root.create_multiword_token(words, form=form)
                    words = []
                form = line[2:-2]

        # Words of the last token (no following token line triggered the flush).
        if words:
            words[0].form = form
            for word in words[1:]:
                word.form = '_'

        nodes = root.descendants(add_self=True)
        if len(nodes) == 1:
            return None
        # Attach the nodes only now, when all of them exist.
        for node_ord, node in enumerate(nodes[1:], 1):
            try:
                node.parent = nodes[parents[node_ord]]
            except IndexError:
                raise ValueError("Node %s HEAD is out of range (%d)" %
                                 (node, parents[node_ord]))

        return root
Exemple #5
0
    def read_tree(self):
        """Read one sentence from the filehandle and return the root of its tree.

        Reading stops at the first empty line or at the end of the input.
        ``#`` lines are appended (without the ``#``) to ``root.comment``,
        lines starting with whitespace create a node via ``self._node``, and
        every other line starts a new token whose form is ``line[2:-2]``.

        Returns ``None`` if there is no filehandle or no node was created;
        raises ``ValueError`` if a parent ord does not index an existing node.
        """
        if self.filehandle is None:
            return None

        tree = Root()
        head_ords = [0]
        pending = []
        surface = None
        for raw in self.filehandle:
            text = raw.rstrip()
            if text == '':
                break
            first_char = text[0]
            if first_char == '#':
                tree.comment += text[1:] + "\n"
            elif first_char.isspace():
                new_node, head_ord = self._node(text.lstrip(), tree)
                pending.append(new_node)
                head_ords.append(head_ord)
            else:
                # A token line: flush the words gathered for the previous token.
                if pending:
                    pending[0].form = surface
                    if len(pending) > 1:
                        pieces = surface.split()
                        if len(pending) == len(pieces):
                            for member, piece in zip(pending, pieces):
                                member.form = piece
                        else:
                            for member in pending[1:]:
                                member.form = '_'
                        tree.create_multiword_token(pending, form=surface)
                    pending = []
                surface = text[2:-2]

        # Words belonging to the final token of the sentence.
        if pending:
            pending[0].form = surface
            for member in pending[1:]:
                member.form = '_'

        all_nodes = tree.descendants(add_self=True)
        if len(all_nodes) == 1:
            return None
        # Parents are assigned last, once every node exists.
        for position, current in enumerate(all_nodes[1:], 1):
            try:
                current.parent = all_nodes[head_ords[position]]
            except IndexError:
                raise ValueError("Node %s HEAD is out of range (%d)" % (current, head_ords[position]))

        return tree
Exemple #6
0
 def read_tree(self, document=None):
     """Read the next line from the filehandle and wrap it in a new tree root.

     When ``self.ignore_empty_lines`` is set, blank lines are skipped first.
     Returns ``None`` if there is no filehandle or the input is exhausted.
     """
     handle = self.filehandle
     if handle is None:
         return None
     current = handle.readline()
     # readline() gives '' only at the end of the file; a blank line is '\n'
     # (or '\r\n' when a Windows file is read on a Unix machine).
     if current == '':
         return None
     if self.ignore_empty_lines:
         while current in ('\n', '\r\n'):
             current = handle.readline()
             if current == '':
                 return None
     tree = Root()
     tree.text = current.rstrip()
     return tree
Exemple #7
0
 def read_tree(self, document=None):
     """Read the next line from the filehandle and wrap it in a new tree root.

     When ``self.ignore_empty_lines`` is set, blank lines are skipped first.
     The stored text is the line right-stripped with ``self.rstrip`` as the
     argument of ``str.rstrip``.  Returns ``None`` if there is no filehandle
     or the input is exhausted.
     """
     handle = self.filehandle
     if handle is None:
         return None
     current = handle.readline()
     # readline() gives '' only at the end of the file; a blank line is '\n'
     # (or '\r\n' when a Windows file is read on a Unix machine).
     if current == '':
         return None
     if self.ignore_empty_lines:
         while current in ('\n', '\r\n'):
             current = handle.readline()
             if current == '':
                 return None
     tree = Root()
     tree.text = current.rstrip(self.rstrip)
     return tree
Exemple #8
0
    def test_print_subtree(self):
        """Test print_subtree() method, which uses udapi.block.write.textmodetrees.

        The output is captured by temporarily replacing ``sys.stdout``; the
        stream that was installed before the test is restored afterwards.
        """
        doc = Document()
        data_filename = os.path.join(os.path.dirname(__file__), 'data', 'enh_deps.conllu')
        doc.load_conllu(data_filename)
        root = doc.bundles[0].get_tree()

        expected1 = ("# sent_id = a-mf920901-001-p1s1A\n"
                     "# text = Slovenská ústava: pro i proti\n"
                     "─┮\n"
                     " │ ╭─╼ Slovenská ADJ amod\n"
                     " ╰─┾ ústava NOUN root\n"
                     "   ┡─╼ : PUNCT punct\n"
                     "   ╰─┮ pro ADP appos\n"
                     "     ┡─╼ i CONJ cc\n"
                     "     ╰─╼ proti ADP conj\n"
                     "\n")
        expected2 = ("─┮\n"
                     " │ ╭─╼ Slovenská Case=Nom|Degree=Pos|Gender=Fem|Negative=Pos|Number=Sing _\n"
                     " ╰─┾ ústava Case=Nom|Gender=Fem|Negative=Pos|Number=Sing SpaceAfter=No\n"
                     "   ┡─╼ : _ _\n"
                     "   ╰─┮ pro AdpType=Prep|Case=Acc LId=pro-1\n"
                     "     ┡─╼ i _ LId=i-1\n"
                     "     ╰─╼ proti AdpType=Prep|Case=Dat LId=proti-1\n"
                     "\n")

        # test non-projective tree
        root3 = Root()
        for i in range(1, 5):
            root3.create_child(form=str(i))
        nodes = root3.descendants(add_self=1)
        nodes[1].parent = nodes[3]
        nodes[4].parent = nodes[2]
        expected3 = ("─┮\n"
                     " │ ╭─╼ 1\n"
                     " ┡─╪───┮ 2\n"
                     " ╰─┶ 3 │\n"
                     "       ╰─╼ 4\n"
                     "\n")

        # Remember the stream that is installed right now: a test runner may have
        # replaced sys.stdout itself, so restoring sys.__stdout__ would undo that.
        original_stdout = sys.stdout
        try:
            sys.stdout = capture = io.StringIO()
            root.print_subtree(color=False)
            self.assertEqual(capture.getvalue(), expected1)
            # Empty the buffer before the next print.
            capture.seek(0)
            capture.truncate()
            root.print_subtree(color=False, attributes='form,feats,misc',
                               print_sent_id=False, print_text=False)
            self.assertEqual(capture.getvalue(), expected2)
            capture.seek(0)
            capture.truncate()
            root3.print_subtree(color=False, attributes='form', print_sent_id=0, print_text=0)
            self.assertEqual(capture.getvalue(), expected3)
        finally:
            sys.stdout = original_stdout  # pylint: disable=redefined-variable-type
Exemple #9
0
    def process_document(self, document):
        """Add a predicted tree to every bundle, re-segmented to match the bundle's first tree.

        For each bundle, the first tree (``g_tree``; the comments and log
        messages below call it "gold") is compared with a predicted tree taken
        from ``self.extract_pred_trees(document)``.  The comparison uses the
        concatenated token forms with spaces removed.  Predicted trees are
        merged (when too short) or split (when too long) until the characters
        match, and the resulting tree is added to the bundle.

        Raises:
            ValueError: if the predicted trees run out while more characters
                are needed, or if the predicted characters do not start with
                the gold characters.
        """
        if not document.bundles:
            return
        pred_trees = self.extract_pred_trees(document)
        # Remember all nodes that were children of a predicted root before any re-segmentation.
        was_subroot = set()
        for pred_tree in pred_trees:
            for n in pred_tree.children:
                was_subroot.add(n)

        for bundle_no, bundle in enumerate(document.bundles):
            g_tree = bundle.trees[0]
            # pop() takes the LAST item of the list -- presumably extract_pred_trees
            # returns the trees in reversed order; TODO confirm.
            p_tree = pred_trees.pop()
            g_chars = ''.join(t.form for t in g_tree.token_descendants).replace(' ', '')
            p_chars = ''.join(t.form for t in p_tree.token_descendants).replace(' ', '')
            # Same characters: the predicted tree can be used as it is.
            if g_chars == p_chars:
                bundle.add_tree(p_tree)
                continue

            # Make sure that p_tree contains enough nodes.
            # NOTE(review): moved_roots is filled below but never read in this method.
            moved_roots = []
            while len(p_chars) < len(g_chars):
                if not pred_trees:
                    raise ValueError('no pred_trees:\n%s\n%s' % (p_chars, g_chars))
                new_p_tree = pred_trees.pop()
                p_chars += ''.join(t.form for t in new_p_tree.token_descendants).replace(' ', '')
                moved_roots.extend(new_p_tree.children)
                p_tree.steal_nodes(new_p_tree.descendants)
            self.choose_root(p_tree, was_subroot, g_tree)

            if not p_chars.startswith(g_chars):
                raise ValueError('sent_id=%s: !p_chars.startswith(g_chars):\np_chars=%s\ng_chars=%s'
                                 % (g_tree.sent_id, p_chars, g_chars))
            if g_chars == p_chars:
                bundle.add_tree(p_tree)
                continue

            # Now p_tree contains more nodes than it should.
            # Re-accumulate the characters token by token to find the split point.
            p_chars = ''
            tokens = p_tree.token_descendants
            for index, token in enumerate(tokens):
                p_chars += token.form.replace(' ', '')
                if len(p_chars) > len(g_chars):
                    logging.warning('Pred token crossing gold sentences: %s', g_tree.sent_id)
                    # E.g. gold cs ln95048-151-p2s8 contains SpaceAfter=No on the last word
                    # of the sentence, resulting in "uklidnila.Komentář" in the raw text.
                    # It is not obvious how to fix this "properly", i.e. without increasing
                    # or decreasing the resulting LAS. The current solution is quite hacky.
                    if index + 1 == len(tokens):
                        # The crossing token is the last one: put the surplus characters
                        # into a single-node tree that will be popped for the next bundle.
                        next_p_tree = Root(zone=p_tree.zone)
                        pred_trees.append(next_p_tree)
                        next_p_tree.create_child(deprel='wrong', form=p_chars[len(g_chars):],
                                                 misc='Rehanged=Yes')
                        bundle.add_tree(p_tree)
                        break
                    else:
                        # Prepend the surplus characters to the form of the following token;
                        # setting p_chars = g_chars makes the equal-length branch below run.
                        next_tok = tokens[index + 1]
                        next_tok.form = p_chars[len(g_chars):] + next_tok.form
                        p_chars = g_chars
                if len(p_chars) == len(g_chars):
                    # Split point found: all the remaining tokens go to a new tree.
                    next_p_tree = Root(zone=p_tree.zone)
                    words = []
                    for token in tokens[index + 1:]:
                        # Multi-word tokens contribute their individual words.
                        if isinstance(token, MWT):
                            words.extend(token.words)
                        else:
                            words.append(token)
                    for word in words:
                        if word in was_subroot:
                            del word.misc['Rehanged']
                        # Mark nodes whose FUNCTIONAL-deprel child ends up in the other tree.
                        if word.parent is not p_tree and word.parent not in words:
                            if word.udeprel in FUNCTIONAL:
                                word.parent.misc['FuncChildMissing'] = 'Yes'
                        for child in word.children:
                            if child not in words and child.udeprel in FUNCTIONAL:
                                word.misc['FuncChildMissing'] = 'Yes'
                    next_p_tree.steal_nodes(words)
                    self.choose_root(p_tree, was_subroot, g_tree)
                    # NOTE(review): bundle_no + 1 would raise IndexError for the last bundle
                    # -- confirm this branch cannot be reached there.
                    self.choose_root(next_p_tree, was_subroot, document.bundles[bundle_no + 1].trees[0])
                    # The new tree will be popped as p_tree for the next bundle.
                    pred_trees.append(next_p_tree)
                    bundle.add_tree(p_tree)
                    break
Exemple #10
0
    def test_print_subtree(self):
        """Test print_subtree() method, which uses udapi.block.write.textmodetrees.

        The output is captured by temporarily replacing ``sys.stdout``; the
        stream that was installed before the test is restored afterwards.
        """
        doc = Document()
        data_filename = os.path.join(os.path.dirname(__file__), 'data',
                                     'enh_deps.conllu')
        doc.load_conllu(data_filename)
        root = doc.bundles[0].get_tree()

        expected1 = ("# sent_id = a-mf920901-001-p1s1A\n"
                     "# text = Slovenská ústava: pro i proti\n"
                     "─┮\n"
                     " │ ╭─╼ Slovenská ADJ amod\n"
                     " ╰─┾ ústava NOUN root\n"
                     "   ┡─╼ : PUNCT punct\n"
                     "   ╰─┮ pro ADP appos\n"
                     "     ┡─╼ i CONJ cc\n"
                     "     ╰─╼ proti ADP conj\n"
                     "\n")
        expected2 = (
            "─┮\n"
            " │ ╭─╼ Slovenská Case=Nom|Degree=Pos|Gender=Fem|Negative=Pos|Number=Sing _\n"
            " ╰─┾ ústava Case=Nom|Gender=Fem|Negative=Pos|Number=Sing SpaceAfter=No\n"
            "   ┡─╼ : _ _\n"
            "   ╰─┮ pro AdpType=Prep|Case=Acc LId=pro-1\n"
            "     ┡─╼ i _ LId=i-1\n"
            "     ╰─╼ proti AdpType=Prep|Case=Dat LId=proti-1\n"
            "\n")

        # test non-projective tree
        root3 = Root()
        for i in range(1, 5):
            root3.create_child(form=str(i))
        nodes = root3.descendants(add_self=1)
        nodes[1].parent = nodes[3]
        nodes[4].parent = nodes[2]
        expected3 = ("─┮\n"
                     " │ ╭─╼ 1\n"
                     " ┡─╪───┮ 2\n"
                     " ╰─┶ 3 │\n"
                     "       ╰─╼ 4\n"
                     "\n")

        # Remember the stream that is installed right now: a test runner may have
        # replaced sys.stdout itself, so restoring sys.__stdout__ would undo that.
        original_stdout = sys.stdout
        try:
            sys.stdout = capture = io.StringIO()
            root.print_subtree(color=False)
            self.assertEqual(capture.getvalue(), expected1)
            # Empty the buffer before the next print.
            capture.seek(0)
            capture.truncate()
            root.print_subtree(color=False,
                               attributes='form,feats,misc',
                               print_sent_id=False,
                               print_text=False)
            self.assertEqual(capture.getvalue(), expected2)
            capture.seek(0)
            capture.truncate()
            root3.print_subtree(color=False,
                                attributes='form',
                                print_sent_id=0,
                                print_text=0)
            self.assertEqual(capture.getvalue(), expected3)
        finally:
            sys.stdout = original_stdout  # pylint: disable=redefined-variable-type
Exemple #11
0
    def read_tree_from_lines(self, lines):
        """Build a tree from the lines of one sentence and return its root.

        Lines starting with ``#`` go to ``self.parse_comment_line``; all other
        lines go to ``self.parse_node_line`` together with the ``nodes``,
        ``parents`` and ``mwts`` lists (presumably filled by that method --
        verify against its implementation).  Parents are attached only after
        all lines were processed, and multi-word tokens are created last.

        Returns:
            The tree root, or ``None`` if no node was read.

        Raises:
            ValueError: if a HEAD index is out of range, or if a cycle is
                detected and ``self.fix_cycles`` is false.
        """
        root = Root()
        nodes = [root]
        parents = [0]
        mwts = []
        for line in lines:
            if line[0] == '#':
                self.parse_comment_line(line, root)
            else:
                self.parse_node_line(line, root, nodes, parents, mwts)

        # If no nodes were read from the filehandle (so only root remained in nodes),
        # we return None as a sign of failure (end of file or more than one empty line).
        if len(nodes) == 1:
            return None

        # Empty sentences are not allowed in CoNLL-U,
        # but if the users want to save just the sentence string and/or sent_id
        # they need to create one artificial node and mark it with Empty=Yes.
        # In that case, we will delete this node, so the tree will have just the (technical) root.
        # See also udapi.block.write.Conllu, which is compatible with this trick.
        if len(nodes) == 2 and str(nodes[1].misc) == 'Empty=Yes':
            nodes.pop()
            root._children = []
            root._descendants = []

        # Set dependency parents (now, all nodes of the tree are created).
        for node_ord, node in enumerate(nodes[1:], 1):
            try:
                parent = nodes[parents[node_ord]]
            except IndexError:
                raise ValueError("Node %s HEAD is out of range (%d)" %
                                 (node, parents[node_ord]))
            if node is parent:
                if self.fix_cycles:
                    logging.warning(
                        "Ignoring a cycle (attaching to the root instead):\n%s",
                        node)
                    # Only redirect the parent here; the shared attachment code
                    # below does the linking. Linking to the root here as well
                    # would let the fall-through re-attach the node to itself.
                    parent = root
                else:
                    raise ValueError(
                        f"Detected a cycle: {node} attached to itself")
            elif node.children:
                # The node already has children, so the new parent could be its
                # own descendant: walk up from the parent and look for the node.
                climbing = parent._parent
                while climbing:
                    if climbing is node:
                        if self.fix_cycles:
                            logging.warning(
                                "Ignoring a cycle (attaching to the root instead):\n%s",
                                parent)
                            parent = root
                            break
                        else:
                            raise ValueError(f"Detected a cycle: {node}")
                    climbing = climbing._parent
            node._parent = parent
            parent._children.append(node)

        # Create multi-word tokens.
        for fields in mwts:
            range_start, range_end = fields[0].split('-')
            words = nodes[int(range_start):int(range_end) + 1]
            root.create_multiword_token(words, form=fields[1], misc=fields[-1])

        return root
Exemple #12
0
    def tokenize_tag_parse_tree(self,
                                root,
                                resegment=False,
                                tag=True,
                                parse=True):
        """Tokenize, tag (+lemmatize, fill FEATS) and parse the text stored in `root.text`.

        The first resulting tree reuses `root`; further trees get a new root.
        If resegment=True, the returned list of Udapi trees may contain multiple trees.
        Raises ValueError if `root` already has children, or for parse=True with tag=False.
        """
        if root.children:
            raise ValueError(
                'Tree already contained nodes before tokenization')

        # Tokenize and segment the text (segmentation cannot be turned off in older UDPipe versions).
        self.tokenizer.setText(root.text)
        segments = []
        while True:
            segment = Sentence()
            if not self.tokenizer.nextSentence(segment):
                break
            segments.append(segment)

        # If resegmentation was not required, we need to join the segments:
        # words of the later segments are renumbered and appended to the first one.
        if not resegment and len(segments) > 1:
            merged = segments[0]
            last_id = merged.words.size() - 1
            for extra in segments[1:]:
                # Index 0 is skipped in every segment.
                for position in range(1, extra.words.size()):
                    moved = extra.words[position]
                    last_id += 1
                    moved.id = last_id
                    merged.words.append(moved)
            segments = [merged]

        # tagging and parsing
        if parse and not tag:
            raise ValueError(
                'Combination parse=True tag=False is not allowed.')
        if tag:
            for segment in segments:
                self.tool.tag(segment, Model.DEFAULT)
                if parse:
                    self.tool.parse(segment, Model.DEFAULT)

        # converting UDPipe nodes to Udapi nodes
        result = []
        target_root = root
        for segment in segments:
            if not target_root:
                target_root = Root()
            created = [target_root]
            head_ords = []
            seg_words = segment.words
            for position in range(1, seg_words.size()):
                source = seg_words[position]
                created_node = target_root.create_child(
                    form=source.form,
                    lemma=source.lemma,
                    upos=source.upostag,
                    xpos=source.xpostag,
                    feats=source.feats,
                    deprel=source.deprel,
                    misc=source.misc,
                )
                if parse:
                    head_ords.append(source.head)
                    created.append(created_node)
            if parse:
                # Heads index into `created`, whose item 0 is the tree root.
                for created_node, head_ord in zip(created[1:], head_ords):
                    created_node.parent = created[head_ord]
            result.append(target_root)
            target_root = None
        return result
Exemple #13
0
    def process_document(self, document):
        """Add a predicted tree to every bundle, re-segmented to match the bundle's first tree.

        For each bundle, the first tree (``g_tree``; the comments and log
        messages below call it "gold") is compared with a predicted tree taken
        from ``self.extract_pred_trees(document)``.  The comparison uses the
        concatenated token forms after ``self._strip_spaces`` and after
        dropping characters of Unicode category ``Zs``.  Predicted trees are
        merged (when too short) or split (when too long) until the characters
        match, and the resulting tree is added to the bundle.

        Raises:
            ValueError: if the predicted trees run out while more characters
                are needed, or if the predicted characters do not start with
                the gold characters.
        """
        if not document.bundles:
            return
        pred_trees = self.extract_pred_trees(document)
        # Remember all nodes that were children of a predicted root before any re-segmentation.
        was_subroot = set()
        for pred_tree in pred_trees:
            for n in pred_tree.children:
                was_subroot.add(n)

        for bundle_no, bundle in enumerate(document.bundles):
            g_tree = bundle.trees[0]
            # pop() takes the LAST item of the list -- presumably extract_pred_trees
            # returns the trees in reversed order; TODO confirm.
            p_tree = pred_trees.pop()
            g_chars = self._strip_spaces(''.join(t.form for t in g_tree.token_descendants))
            p_chars = self._strip_spaces(''.join(t.form for t in p_tree.token_descendants))
            # Additionally drop every character whose Unicode category is "Zs".
            g_chars = ''.join(filter(lambda c: unicodedata.category(c) != "Zs", g_chars))
            p_chars = ''.join(filter(lambda c: unicodedata.category(c) != "Zs", p_chars))
            # Same characters: the predicted tree can be used as it is.
            if g_chars == p_chars:
                bundle.add_tree(p_tree)
                continue

            # Make sure that p_tree contains enough nodes.
            # NOTE(review): moved_roots is filled below but never read in this method.
            moved_roots = []
            while len(p_chars) < len(g_chars):
                if not pred_trees:
                    raise ValueError('no pred_trees:\n%s\n%s' % (p_chars, g_chars))
                new_p_tree = pred_trees.pop()
                # NOTE(review): only _strip_spaces is applied here (no "Zs" filter as above)
                # -- confirm that _strip_spaces removes the same characters.
                p_chars += self._strip_spaces(''.join(t.form for t in new_p_tree.token_descendants))
                moved_roots.extend(new_p_tree.children)
                p_tree.steal_nodes(new_p_tree.descendants)
            self.choose_root(p_tree, was_subroot, g_tree)

            if not p_chars.startswith(g_chars):
                raise ValueError('sent_id=%s: !p_chars.startswith(g_chars):\np_chars=%s\ng_chars=%s'
                                 % (g_tree.sent_id, p_chars, g_chars))
            if g_chars == p_chars:
                bundle.add_tree(p_tree)
                continue

            # Now p_tree contains more nodes than it should.
            # Re-accumulate the characters token by token to find the split point.
            p_chars = ''
            tokens = p_tree.token_descendants
            for index, token in enumerate(tokens):
                p_chars += self._strip_spaces(token.form)
                if len(p_chars) > len(g_chars):
                    logging.warning('Pred token crossing gold sentences: %s', g_tree.sent_id)
                    # E.g. gold cs ln95048-151-p2s8 contains SpaceAfter=No on the last word
                    # of the sentence, resulting in "uklidnila.Komentář" in the raw text.
                    # It is not obvious how to fix this "properly", i.e. without increasing
                    # or decreasing the resulting LAS. The current solution is quite hacky.
                    if index + 1 == len(tokens):
                        # The crossing token is the last one: put the surplus characters
                        # into a single-node tree that will be popped for the next bundle.
                        next_p_tree = Root(zone=p_tree.zone)
                        pred_trees.append(next_p_tree)
                        next_p_tree.create_child(deprel='wrong', form=p_chars[len(g_chars):],
                                                 misc='Rehanged=Yes')
                        bundle.add_tree(p_tree)
                        break
                    else:
                        # Prepend the surplus characters to the form of the following token;
                        # setting p_chars = g_chars makes the equal-length branch below run.
                        next_tok = tokens[index + 1]
                        next_tok.form = p_chars[len(g_chars):] + next_tok.form
                        p_chars = g_chars
                if len(p_chars) == len(g_chars):
                    # Split point found: all the remaining tokens go to a new tree.
                    next_p_tree = Root(zone=p_tree.zone)
                    words = []
                    for token in tokens[index + 1:]:
                        # Multi-word tokens contribute their individual words.
                        if isinstance(token, MWT):
                            words.extend(token.words)
                        else:
                            words.append(token)
                    for word in words:
                        if word in was_subroot:
                            del word.misc['Rehanged']
                        # Mark nodes whose FUNCTIONAL-deprel child ends up in the other tree.
                        if word.parent is not p_tree and word.parent not in words:
                            if word.udeprel in FUNCTIONAL:
                                word.parent.misc['FuncChildMissing'] = 'Yes'
                        for child in word.children:
                            if child not in words and child.udeprel in FUNCTIONAL:
                                word.misc['FuncChildMissing'] = 'Yes'
                    next_p_tree.steal_nodes(words)
                    self.choose_root(p_tree, was_subroot, g_tree)
                    # NOTE(review): bundle_no + 1 would raise IndexError for the last bundle
                    # -- confirm this branch cannot be reached there.
                    self.choose_root(next_p_tree, was_subroot, document.bundles[bundle_no + 1].trees[0])
                    # The new tree will be popped as p_tree for the next bundle.
                    pred_trees.append(next_p_tree)
                    bundle.add_tree(p_tree)
                    break
Exemple #14
0
    def read_tree_from_lines(self, lines):
        """Build a tree from the lines of one sentence and return its root.

        Lines starting with ``#`` go to ``self.parse_comment_line``.  Every
        other line is split on tabs into ten columns
        (ord, form, lemma, upos, xpos, feats, head, deprel, deps, misc);
        shorter lines are padded with ``_`` unless ``self.strict`` is set.
        Lines with ``-`` in the first column (multi-word tokens) are processed
        at the end, lines with ``.`` in the first column become empty nodes.
        Parents are attached only after all lines were processed.

        Returns:
            The tree root, or ``None`` if no node was read.

        Raises:
            RuntimeError: in strict mode, if a line does not have ten columns.
            ValueError: if HEAD is not an integer (except ``_`` in non-strict
                mode), if HEAD is out of range, or if a cycle is detected
                and ``self.fix_cycles`` is false.
        """
        root = Root()
        nodes = [root]
        parents = [0]
        mwts = []
        for line in lines:
            if line[0] == '#':
                self.parse_comment_line(line, root)
            else:
                fields = line.split('\t')
                if len(fields) != 10:
                    if self.strict:
                        raise RuntimeError('Wrong number of columns in %r' %
                                           line)
                    fields.extend(['_'] * (10 - len(fields)))
                # multi-word tokens will be processed later
                if '-' in fields[0]:
                    mwts.append(fields)
                    continue
                if '.' in fields[0]:
                    empty = root.create_empty_child(form=fields[1],
                                                    lemma=fields[2],
                                                    upos=fields[3],
                                                    xpos=fields[4],
                                                    feats=fields[5],
                                                    misc=fields[9])
                    empty.ord = fields[0]
                    empty.raw_deps = fields[8]  # TODO
                    continue

                # An underscore in upos, xpos and deprel is stored as None.
                if fields[3] == '_':
                    fields[3] = None
                if fields[4] == '_':
                    fields[4] = None
                if fields[7] == '_':
                    fields[7] = None

                # ord,form,lemma,upos,xpos,feats,head,deprel,deps,misc
                node = Node(root=root,
                            form=fields[1],
                            lemma=fields[2],
                            upos=fields[3],
                            xpos=fields[4],
                            feats=fields[5],
                            deprel=fields[7],
                            misc=fields[9])
                root._descendants.append(node)
                node._ord = int(fields[0])
                if fields[8] != '_':
                    node.raw_deps = fields[8]
                try:
                    parents.append(int(fields[6]))
                except ValueError as exception:
                    # In non-strict mode, a missing head ("_") means the root.
                    if not self.strict and fields[6] == '_':
                        if self.empty_parent == 'warn':
                            logging.warning("Empty parent/head index in '%s'",
                                            line)
                        parents.append(0)
                    else:
                        raise exception

                nodes.append(node)

        # If no nodes were read from the filehandle (so only root remained in nodes),
        # we return None as a sign of failure (end of file or more than one empty line).
        if len(nodes) == 1:
            return None

        # Empty sentences are not allowed in CoNLL-U,
        # but if the users want to save just the sentence string and/or sent_id
        # they need to create one artificial node and mark it with Empty=Yes.
        # In that case, we will delete this node, so the tree will have just the (technical) root.
        # See also udapi.block.write.Conllu, which is compatible with this trick.
        if len(nodes) == 2 and str(nodes[1].misc) == 'Empty=Yes':
            nodes.pop()
            root._children = []
            root._descendants = []

        # Set dependency parents (now, all nodes of the tree are created).
        for node_ord, node in enumerate(nodes[1:], 1):
            try:
                parent = nodes[parents[node_ord]]
            except IndexError:
                raise ValueError("Node %s HEAD is out of range (%d)" %
                                 (node, parents[node_ord]))
            if node is parent:
                if self.fix_cycles:
                    logging.warning(
                        "Ignoring a cycle (attaching to the root instead):\n%s",
                        node)
                    # Only redirect the parent here; the shared attachment code
                    # below does the linking. Linking to the root here as well
                    # would let the fall-through re-attach the node to itself.
                    parent = root
                else:
                    raise ValueError(
                        f"Detected a cycle: {node} attached to itself")
            elif node.children:
                # The node already has children, so the new parent could be its
                # own descendant: walk up from the parent and look for the node.
                climbing = parent._parent
                while climbing:
                    if climbing is node:
                        if self.fix_cycles:
                            logging.warning(
                                "Ignoring a cycle (attaching to the root instead):\n%s",
                                parent)
                            parent = root
                            break
                        else:
                            raise ValueError(f"Detected a cycle: {node}")
                    climbing = climbing._parent
            node._parent = parent
            parent._children.append(node)

        # Create multi-word tokens.
        for fields in mwts:
            range_start, range_end = fields[0].split('-')
            words = nodes[int(range_start):int(range_end) + 1]
            root.create_multiword_token(words, form=fields[1], misc=fields[-1])

        return root
 def create_tree(self, zone=None):
     """Create a new tree root with the given zone, add it via `add_tree` and return it."""
     new_root = Root()
     new_root.zone = zone
     # The zone is assigned before the tree is registered.
     self.add_tree(new_root)
     return new_root
Exemple #16
0
    def read_tree(self, document=None):
        """Read one sentence from `self.filehandle` and build its tree.

        Lines are consumed until the first empty line or the end of the
        file. Comment lines (starting with '#') are passed to
        `self.parse_comment_line`; every other line is split into columns
        according to `self.separator` and mapped onto node attributes in the
        order given by `self.node_attributes`.

        Args:
            document: not used by this method.

        Returns:
            The `Root` of the new tree, or None when there is no filehandle
            or when no node line was read (end of file, or an empty line
            came first).

        Raises:
            ValueError: if `self.separator` is not 'tab', 'space' or
                'doublespace'; if an ord/head column is not an integer
                (an '_' head is tolerated in non-strict mode); or if a head
                index does not refer to any node of the sentence.
            RuntimeError: in strict mode, if a line does not have exactly
                `len(self.node_attributes)` columns.
        """
        if self.filehandle is None:
            return None

        root = Root()
        nodes = [root]     # nodes[i] is the node with ord i (root is 0)
        parents = [0]      # parents[i] is the head index of nodes[i]
        mwts = []          # raw columns of multi-word token lines
        for line in self.filehandle:
            line = line.rstrip()
            if line == '':
                break
            if line[0] == '#':
                self.parse_comment_line(line, root)
            else:
                if self.separator == 'tab':
                    fields = line.split('\t')
                elif self.separator == 'space':
                    fields = line.split()
                elif self.separator == 'doublespace':
                    fields = re.split('  +', line)
                else:
                    raise ValueError('separator=%s is not valid' %
                                     self.separator)
                if len(fields) != len(self.node_attributes):
                    if self.strict:
                        raise RuntimeError('Wrong number of columns in %r' %
                                           line)
                    # Pad missing columns with '_'; a non-positive count
                    # (too many columns) makes this a no-op.
                    fields.extend(['_'] *
                                  (len(self.node_attributes) - len(fields)))
                # multi-word tokens will be processed later
                if '-' in fields[0]:
                    mwts.append(fields)
                    continue
                # Empty nodes have a decimal ID such as "1.1".
                # NOTE(review): the column indices below are hard-coded and
                # presumably assume the standard CoNLL-U column order,
                # regardless of self.node_attributes -- confirm.
                if '.' in fields[0]:
                    empty = root.create_empty_child(form=fields[1],
                                                    lemma=fields[2],
                                                    upos=fields[3],
                                                    xpos=fields[4],
                                                    feats=fields[5],
                                                    misc=fields[9])
                    empty.ord = fields[0]
                    empty.raw_deps = fields[8]  # TODO
                    continue

                node = root.create_child()

                # TODO slow implementation of speed-critical loading
                for (n_attribute,
                     attribute_name) in enumerate(self.node_attributes):
                    if attribute_name == 'head':
                        try:
                            parents.append(int(fields[n_attribute]))
                        except ValueError:
                            if not self.strict and fields[n_attribute] == '_':
                                if self.empty_parent == 'warn':
                                    logging.warning(
                                        "Empty parent/head index in '%s'",
                                        line)
                                # A missing head is attached to the root.
                                parents.append(0)
                            else:
                                # Re-raise the original error unchanged.
                                raise
                    elif attribute_name == 'ord':
                        setattr(node, 'ord', int(fields[n_attribute]))
                    elif attribute_name == 'deps':
                        setattr(node, 'raw_deps', fields[n_attribute])
                    elif attribute_name != '_':
                        setattr(node, attribute_name, fields[n_attribute])

                nodes.append(node)

        # If no nodes were read from the filehandle (so only root remained in nodes),
        # we return None as a sign of failure (end of file or more than one empty line).
        if len(nodes) == 1:
            return None

        # Empty sentences are not allowed in CoNLL-U,
        # but if the users want to save just the sentence string and/or sent_id
        # they need to create one artificial node and mark it with Empty=Yes.
        # In that case, we will delete this node, so the tree will have just the (technical) root.
        # See also udapi.block.write.Conllu, which is compatible with this trick.
        if len(nodes) == 2 and nodes[1].misc == 'Empty=Yes':
            nodes.pop()

        # Set dependency parents (now, all nodes of the tree are created).
        # TODO: parent setter checks for cycles, but this is something like O(n*log n)
        # if done for each node. It could be done faster if the whole tree is checked at once.
        # Also parent setter removes the node from its old parent's list of children,
        # this could be skipped here by not using `node = root.create_child()`.
        n_nodes = len(nodes)
        for node_ord, node in enumerate(nodes[1:], 1):
            parent_ord = parents[node_ord]
            # Check the range explicitly: a negative HEAD would otherwise be
            # accepted by list indexing and silently attach the node to a
            # node counted from the end of the sentence.
            if parent_ord < 0 or parent_ord >= n_nodes:
                raise ValueError("Node %s HEAD is out of range (%d)" %
                                 (node, parent_ord))
            node.parent = nodes[parent_ord]

        # Create multi-word tokens.
        for fields in mwts:
            range_start, range_end = fields[0].split('-')
            words = nodes[int(range_start):int(range_end) + 1]
            root.create_multiword_token(words, form=fields[1], misc=fields[-1])

        return root
Exemple #17
0
    def process_document( self, document ):
        """Load up to `self.bundles_per_document` sentences into `document`.

        Lines are read from `self.filehandle` one at a time. For each
        sentence a new `Bundle` holding one `Root` is appended to
        `document.bundles`; every token line becomes a `Node` whose
        attributes are filled from the tab-separated columns in the order
        given by `Document.attrnames`. A blank line closes the current
        sentence and links its nodes to their parents.

        Args:
            document: the document whose `bundles` list is extended.

        Returns:
            `document` once the requested number of sentences was loaded.
            When the end of the file is reached first, `self.finished` is
            set to True and None is returned.
        """

        def attach_nodes(tree_nodes):
            # tree_nodes[0] is the root; each node's `head` is an index
            # into tree_nodes.
            tree_nodes[0]._aux['descendants'] = tree_nodes[1:]
            for tree_node in tree_nodes[1:]:
                tree_node.set_parent( tree_nodes[tree_node.head] )

        number_of_loaded_bundles = 0

        nodes = []
        comment = ''

        while number_of_loaded_bundles < self.bundles_per_document:

            # TODO: more or less cut'n'paste from document.py (in which it should be deleted)

            line = self.filehandle.readline()
            if line == '': # EOF
                self.finished = True
                # The file may lack the blank line after its last sentence;
                # finish the pending tree so it is not left without
                # descendants and parent links.
                if nodes:
                    attach_nodes(nodes)
                return

            if line.startswith('#'):
                comment = comment + line

            elif re.search(r'^\d+\-', line):  # HACK: multiword tokens temporarily avoided
                pass

            elif line.strip():

                if not nodes:
                    # First token of a new sentence: create its bundle and root.
                    bundle = Bundle()
                    document.bundles.append(bundle)
                    root = Root()
                    root._aux['comment'] = comment # TODO: store this somewhere properly
                    nodes = [root]
                    bundle.trees.append(root)

                columns = line.strip().split('\t')

                node = Node()
                nodes.append(node)

                columns.append(None)  # TODO: why was the last column missing in some files?

                for index, attrname in enumerate(Document.attrnames):
                    setattr( node, attrname, columns[index] )

                try:  # TODO: where do the underscores in this column come from
                    node.head = int(node.head)
                except ValueError:
                    node.head = 0

                try:   # TODO: handle multi-word tokens
                    node.ord = int(node.ord)
                except ValueError:
                    pass # node.ord = 0

            else: # an empty line is guaranteed even after the last sentence in a conll-u file

                if len(nodes) == 0:
                    # Parenthesized single argument prints the same text on
                    # both Python 2 and Python 3.
                    print("Warning: this is weird: probably two empty lines following each other") # TODO: resolve
                else:
                    number_of_loaded_bundles += 1
                    attach_nodes(nodes)
                    nodes = []
                    comment = ''

        return document