Beispiel #1
0
    def read_tree(self, document=None):
        """Read one tree from ``self.filehandle`` and return its root.

        Lines are consumed until the first empty line or the end of input.
        Lines starting with ``#`` are handed to ``self.parse_comment_line``;
        every other line is split into columns according to
        ``self.separator`` and interpreted according to
        ``self.node_attributes``.

        Args:
            document: not used by this method.

        Returns:
            The newly created ``Root``, or None when ``self.filehandle`` is
            None or when no regular node line was read.

        Raises:
            ValueError: if ``self.separator`` is not one of ``'tab'``,
                ``'space'`` or ``'doublespace'``, if an ``ord``/``head``
                column is not an integer (a ``'_'`` head is tolerated when
                ``self.strict`` is false), or if a head index does not refer
                to any node that was read.
            RuntimeError: if ``self.strict`` is set and a line does not have
                exactly ``len(self.node_attributes)`` columns.
        """
        if self.filehandle is None:
            return None

        root = Root()
        nodes = [root]
        parents = [0]
        mwts = []
        for line in self.filehandle:
            line = line.rstrip()
            if line == '':
                break
            if line[0] == '#':
                self.parse_comment_line(line, root)
                continue

            if self.separator == 'tab':
                fields = line.split('\t')
            elif self.separator == 'space':
                fields = line.split()
            elif self.separator == 'doublespace':
                fields = re.split('  +', line)
            else:
                raise ValueError('separator=%s is not valid' %
                                 self.separator)
            if len(fields) != len(self.node_attributes):
                if self.strict:
                    raise RuntimeError('Wrong number of columns in %r' %
                                       line)
                # Pad missing columns; surplus columns are simply ignored below.
                fields.extend(['_'] *
                              (len(self.node_attributes) - len(fields)))
            # multi-word tokens will be processed later
            if '-' in fields[0]:
                mwts.append(fields)
                continue
            if '.' in fields[0]:
                # Empty nodes are read with fixed column positions,
                # independently of self.node_attributes.
                empty = root.create_empty_child(form=fields[1],
                                                lemma=fields[2],
                                                upos=fields[3],
                                                xpos=fields[4],
                                                feats=fields[5],
                                                misc=fields[9])
                empty.ord = fields[0]
                empty.raw_deps = fields[8]  # TODO
                continue

            node = root.create_child()

            # TODO slow implementation of speed-critical loading
            for attribute_name, value in zip(self.node_attributes, fields):
                if attribute_name == 'head':
                    try:
                        parents.append(int(value))
                    except ValueError:
                        if self.strict or value != '_':
                            raise
                        if self.empty_parent == 'warn':
                            logging.warning(
                                "Empty parent/head index in '%s'",
                                line)
                        parents.append(0)
                elif attribute_name == 'ord':
                    node.ord = int(value)
                elif attribute_name == 'deps':
                    node.raw_deps = value
                elif attribute_name != '_':
                    setattr(node, attribute_name, value)

            nodes.append(node)

        # If no nodes were read from the filehandle (so only root remained in nodes),
        # we return None as a sign of failure (end of file or more than one empty line).
        if len(nodes) == 1:
            return None

        # Empty sentences are not allowed in CoNLL-U,
        # but if the users want to save just the sentence string and/or sent_id
        # they need to create one artificial node and mark it with Empty=Yes.
        # In that case, we will delete this node, so the tree will have just the (technical) root.
        # See also udapi.block.write.Conllu, which is compatible with this trick.
        # NOTE(review): only the local list is shortened here; whether the node
        # must also be detached from root is not visible from this method.
        if len(nodes) == 2 and nodes[1].misc == 'Empty=Yes':
            nodes.pop()

        # Set dependency parents (now, all nodes of the tree are created).
        # TODO: parent setter checks for cycles, but this is something like O(n*log n)
        # if done for each node. It could be done faster if the whole tree is checked at once.
        # Also parent setter removes the node from its old parent's list of children,
        # this could be skipped here by not using `node = root.create_child()`.
        for node_ord, node in enumerate(nodes[1:], 1):
            head = parents[node_ord]
            # Explicit range check: a negative HEAD would otherwise be accepted
            # by Python's negative indexing and attach the node to a wrong parent.
            if not 0 <= head < len(nodes):
                raise ValueError("Node %s HEAD is out of range (%d)" %
                                 (node, head))
            node.parent = nodes[head]

        # Create multi-word tokens.
        for fields in mwts:
            range_start, range_end = fields[0].split('-')
            words = nodes[int(range_start):int(range_end) + 1]
            root.create_multiword_token(words, form=fields[1], misc=fields[-1])

        return root
Beispiel #2
0
    def read_tree_from_lines(self, lines):
        """Build one tree from the given CoNLL-U lines and return its root.

        Lines starting with ``#`` are handed to ``self.parse_comment_line``;
        every other line is split on tabs into the ten columns
        ord, form, lemma, upos, xpos, feats, head, deprel, deps, misc.

        Args:
            lines: iterable of non-empty strings (the first character of
                each line is inspected), one comment or token per item.

        Returns:
            The newly created ``Root``, or None when no regular node line
            was present.

        Raises:
            RuntimeError: if ``self.strict`` is set and a line does not have
                exactly 10 columns.
            ValueError: if the ord or head column is not an integer (a
                ``'_'`` head is tolerated when ``self.strict`` is false), if
                a head index does not refer to any node that was read, or if
                a cycle is detected while ``self.fix_cycles`` is false.
        """
        root = Root()
        nodes = [root]
        parents = [0]
        mwts = []
        for line in lines:
            if line[0] == '#':
                self.parse_comment_line(line, root)
                continue

            fields = line.split('\t')
            if len(fields) != 10:
                if self.strict:
                    raise RuntimeError('Wrong number of columns in %r' %
                                       line)
                fields.extend(['_'] * (10 - len(fields)))
            # multi-word tokens will be processed later
            if '-' in fields[0]:
                mwts.append(fields)
                continue
            if '.' in fields[0]:
                empty = root.create_empty_child(form=fields[1],
                                                lemma=fields[2],
                                                upos=fields[3],
                                                xpos=fields[4],
                                                feats=fields[5],
                                                misc=fields[9])
                empty.ord = fields[0]
                empty.raw_deps = fields[8]  # TODO
                continue

            # An underscore in upos, xpos or deprel is passed on as None.
            for column in (3, 4, 7):
                if fields[column] == '_':
                    fields[column] = None

            # ord,form,lemma,upos,xpos,feats,head,deprel,deps,misc
            node = Node(root=root,
                        form=fields[1],
                        lemma=fields[2],
                        upos=fields[3],
                        xpos=fields[4],
                        feats=fields[5],
                        deprel=fields[7],
                        misc=fields[9])
            root._descendants.append(node)
            node._ord = int(fields[0])
            if fields[8] != '_':
                node.raw_deps = fields[8]
            try:
                parents.append(int(fields[6]))
            except ValueError:
                if self.strict or fields[6] != '_':
                    raise
                if self.empty_parent == 'warn':
                    logging.warning("Empty parent/head index in '%s'",
                                    line)
                parents.append(0)

            nodes.append(node)

        # If no nodes were read from the filehandle (so only root remained in nodes),
        # we return None as a sign of failure (end of file or more than one empty line).
        if len(nodes) == 1:
            return None

        # Empty sentences are not allowed in CoNLL-U,
        # but if the users want to save just the sentence string and/or sent_id
        # they need to create one artificial node and mark it with Empty=Yes.
        # In that case, we will delete this node, so the tree will have just the (technical) root.
        # See also udapi.block.write.Conllu, which is compatible with this trick.
        if len(nodes) == 2 and str(nodes[1].misc) == 'Empty=Yes':
            nodes.pop()
            root._children = []
            root._descendants = []

        # Set dependency parents (now, all nodes of the tree are created).
        for node_ord, node in enumerate(nodes[1:], 1):
            head = parents[node_ord]
            # Explicit range check: a negative HEAD would otherwise be accepted
            # by Python's negative indexing and attach the node to a wrong parent.
            if not 0 <= head < len(nodes):
                raise ValueError("Node %s HEAD is out of range (%d)" %
                                 (node, head))
            parent = nodes[head]
            if node is parent:
                if not self.fix_cycles:
                    raise ValueError(
                        f"Detected a cycle: {node} attached to itself")
                logging.warning(
                    "Ignoring a cycle (attaching to the root instead):\n%s",
                    node)
                # Only rebind the parent; the shared attach code below then
                # links the node to root exactly once.
                parent = root
            elif node.children:
                # The node already has children attached, so one of them could
                # be an ancestor of the intended parent: walk up to check.
                climbing = parent._parent
                while climbing:
                    if climbing is node:
                        if not self.fix_cycles:
                            raise ValueError(f"Detected a cycle: {node}")
                        logging.warning(
                            "Ignoring a cycle (attaching to the root instead):\n%s",
                            parent)
                        parent = root
                        break
                    climbing = climbing._parent
            node._parent = parent
            parent._children.append(node)

        # Create multi-word tokens.
        for fields in mwts:
            range_start, range_end = fields[0].split('-')
            words = nodes[int(range_start):int(range_end) + 1]
            root.create_multiword_token(words, form=fields[1], misc=fields[-1])

        return root