def read_tree_from_lines(self, lines):
    """Build one dependency tree from the lines of a single sentence.

    Lines whose first character is ``#`` are handed to
    ``self.parse_comment_line``; every other line is handed to
    ``self.parse_node_line`` together with ``nodes``, ``parents`` and
    ``mwts``.  The rest of this method relies on that helper appending each
    created node to ``nodes``, its HEAD index to ``parents`` (same position),
    and the raw fields of multi-word-token lines to ``mwts``.

    Args:
        lines: iterable of non-empty strings (``line[0]`` is indexed directly).

    Returns:
        The ``Root`` of the new tree, or ``None`` if no node line was read.

    Raises:
        ValueError: if a HEAD index points past the last node, or if a cycle
            is detected while ``self.fix_cycles`` is false.
    """
    root = Root()
    nodes = [root]
    parents = [0]
    mwts = []
    for line in lines:
        if line[0] == '#':
            self.parse_comment_line(line, root)
        else:
            self.parse_node_line(line, root, nodes, parents, mwts)

    # If no nodes were read (so only root remained in nodes),
    # we return None as a sign of failure (end of file or more than one empty line).
    if len(nodes) == 1:
        return None

    # Empty sentences are not allowed in CoNLL-U,
    # but if the users want to save just the sentence string and/or sent_id
    # they need to create one artificial node and mark it with Empty=Yes.
    # In that case, we will delete this node, so the tree will have just the (technical) root.
    # See also udapi.block.write.Conllu, which is compatible with this trick.
    if len(nodes) == 2 and str(nodes[1].misc) == 'Empty=Yes':
        nodes.pop()
        root._children = []
        root._descendants = []

    # Set dependency parents (now, all nodes of the tree are created).
    for node_ord, node in enumerate(nodes[1:], 1):
        # NOTE(review): a negative HEAD is not rejected here; Python list
        # indexing makes it count from the end of `nodes`.
        try:
            parent = nodes[parents[node_ord]]
        except IndexError as err:
            raise ValueError("Node %s HEAD is out of range (%d)"
                             % (node, parents[node_ord])) from err

        if node is parent:
            if not self.fix_cycles:
                raise ValueError(f"Detected a cycle: {node} attached to itself")
            logging.warning(
                "Ignoring a cycle (attaching to the root instead):\n%s", node)
            # Only re-target the parent; the shared attachment below then runs
            # exactly once.  Attaching here *and* below would leave the node as
            # its own parent/child and list it under the root as well.
            parent = root
        elif node.children:
            # Only a node that already has children can close a cycle:
            # climb from the intended parent towards the root looking for `node`.
            climbing = parent._parent
            while climbing:
                if climbing is node:
                    if not self.fix_cycles:
                        raise ValueError(f"Detected a cycle: {node}")
                    logging.warning(
                        "Ignoring a cycle (attaching to the root instead):\n%s", parent)
                    parent = root
                    break
                climbing = climbing._parent

        node._parent = parent
        parent._children.append(node)

    # Create multi-word tokens.
    for fields in mwts:
        range_start, range_end = fields[0].split('-')
        words = nodes[int(range_start):int(range_end) + 1]
        root.create_multiword_token(words, form=fields[1], misc=fields[-1])

    return root
def read_tree_from_lines(self, lines):
    """Build one dependency tree from the lines of a single sentence.

    Lines whose first character is ``#`` are handed to
    ``self.parse_comment_line``.  Every other line is split on tabs into ten
    columns (ord, form, lemma, upos, xpos, feats, head, deprel, deps, misc):

    * an ``ord`` containing ``-`` marks a multi-word token, processed last;
    * an ``ord`` containing ``.`` creates an empty child of the root;
    * anything else becomes a regular ``Node``.

    Args:
        lines: iterable of non-empty strings (``line[0]`` is indexed directly).

    Returns:
        The ``Root`` of the new tree, or ``None`` if no regular node was read.

    Raises:
        RuntimeError: if a line does not have ten columns and ``self.strict``
            is true.
        ValueError: if HEAD is not an integer (unless it is ``_`` and
            ``self.strict`` is false), if a HEAD index points past the last
            node, or if a cycle is detected while ``self.fix_cycles`` is false.
    """
    root = Root()
    nodes = [root]
    parents = [0]
    mwts = []
    for line in lines:
        if line[0] == '#':
            self.parse_comment_line(line, root)
            continue

        fields = line.split('\t')
        if len(fields) != 10:
            if self.strict:
                raise RuntimeError('Wrong number of columns in %r' % line)
            # Pad short lines with "_"; a negative count yields no padding.
            fields.extend(['_'] * (10 - len(fields)))

        # multi-word tokens will be processed later
        if '-' in fields[0]:
            mwts.append(fields)
            continue
        if '.' in fields[0]:
            empty = root.create_empty_child(form=fields[1], lemma=fields[2], upos=fields[3],
                                            xpos=fields[4], feats=fields[5], misc=fields[9])
            empty.ord = fields[0]
            empty.raw_deps = fields[8]  # TODO
            continue

        # An underscore in upos, xpos or deprel is passed to Node as None.
        for column in (3, 4, 7):
            if fields[column] == '_':
                fields[column] = None

        # ord,form,lemma,upos,xpos,feats,head,deprel,deps,misc
        node = Node(root=root, form=fields[1], lemma=fields[2], upos=fields[3],
                    xpos=fields[4], feats=fields[5], deprel=fields[7], misc=fields[9])
        root._descendants.append(node)
        node._ord = int(fields[0])
        if fields[8] != '_':
            node.raw_deps = fields[8]
        try:
            parents.append(int(fields[6]))
        except ValueError:
            if self.strict or fields[6] != '_':
                raise
            # A missing head ("_") is tolerated in non-strict mode: attach to the root.
            if self.empty_parent == 'warn':
                logging.warning("Empty parent/head index in '%s'", line)
            parents.append(0)

        nodes.append(node)

    # If no nodes were read (so only root remained in nodes),
    # we return None as a sign of failure (end of file or more than one empty line).
    if len(nodes) == 1:
        return None

    # Empty sentences are not allowed in CoNLL-U,
    # but if the users want to save just the sentence string and/or sent_id
    # they need to create one artificial node and mark it with Empty=Yes.
    # In that case, we will delete this node, so the tree will have just the (technical) root.
    # See also udapi.block.write.Conllu, which is compatible with this trick.
    if len(nodes) == 2 and str(nodes[1].misc) == 'Empty=Yes':
        nodes.pop()
        root._children = []
        root._descendants = []

    # Set dependency parents (now, all nodes of the tree are created).
    for node_ord, node in enumerate(nodes[1:], 1):
        # NOTE(review): a negative HEAD is not rejected here; Python list
        # indexing makes it count from the end of `nodes`.
        try:
            parent = nodes[parents[node_ord]]
        except IndexError as err:
            raise ValueError("Node %s HEAD is out of range (%d)"
                             % (node, parents[node_ord])) from err

        if node is parent:
            if not self.fix_cycles:
                raise ValueError(f"Detected a cycle: {node} attached to itself")
            logging.warning(
                "Ignoring a cycle (attaching to the root instead):\n%s", node)
            # Only re-target the parent; the shared attachment below then runs
            # exactly once.  Attaching here *and* below would leave the node as
            # its own parent/child and list it under the root as well.
            parent = root
        elif node.children:
            # Only a node that already has children can close a cycle:
            # climb from the intended parent towards the root looking for `node`.
            climbing = parent._parent
            while climbing:
                if climbing is node:
                    if not self.fix_cycles:
                        raise ValueError(f"Detected a cycle: {node}")
                    logging.warning(
                        "Ignoring a cycle (attaching to the root instead):\n%s", parent)
                    parent = root
                    break
                climbing = climbing._parent

        node._parent = parent
        parent._children.append(node)

    # Create multi-word tokens.
    for fields in mwts:
        range_start, range_end = fields[0].split('-')
        words = nodes[int(range_start):int(range_end) + 1]
        root.create_multiword_token(words, form=fields[1], misc=fields[-1])

    return root
def read_tree(self):
    """Read the next sentence from ``self.filehandle`` and build its tree.

    Reading stops at the first line that is empty after ``rstrip()``.  Lines
    whose first character is ``#`` are handed to ``self.parse_comment_line``.
    Other lines are split according to ``self.separator`` and their columns
    are interpreted in the order given by ``self.node_attributes``:

    * ``'tab'``: split on tab characters;
    * ``'space'``: split on any run of whitespace;
    * ``'doublespace'``: split on runs of two or more spaces, so a field may
      contain a single space.

    Returns:
        The ``Root`` of the new tree, or ``None`` if ``self.filehandle`` is
        ``None`` or no regular node was read.

    Raises:
        ValueError: if ``self.separator`` is not one of the values above, if
            HEAD is not an integer (unless it is ``_`` and ``self.strict`` is
            false), if a HEAD index points past the last node, or if setting
            a parent fails with ``ValueError`` while ``self.fix_cycles`` is
            false.
        RuntimeError: if a line has a different number of columns than
            ``self.node_attributes`` and ``self.strict`` is true.
    """
    if self.filehandle is None:
        return None

    root = Root()
    nodes = [root]
    parents = [0]
    mwts = []
    for line in self.filehandle:
        line = line.rstrip()
        if line == '':
            break
        if line[0] == '#':
            self.parse_comment_line(line, root)
            continue

        if self.separator == 'tab':
            fields = line.split('\t')
        elif self.separator == 'space':
            fields = line.split()
        elif self.separator == 'doublespace':
            # Two or more spaces separate columns; a lone space stays inside
            # the field.  The quantifier is spelled out so the pattern does not
            # depend on the exact number of blanks in the literal.
            fields = re.split(r' {2,}', line)
        else:
            raise ValueError('separator=%s is not valid' % self.separator)

        if len(fields) != len(self.node_attributes):
            if self.strict:
                raise RuntimeError('Wrong number of columns in %r' % line)
            # Pad short lines with "_"; a negative count yields no padding.
            fields.extend(['_'] * (len(self.node_attributes) - len(fields)))

        # multi-word tokens will be processed later
        if '-' in fields[0]:
            mwts.append(fields)
            continue
        if '.' in fields[0]:
            # NOTE(review): the column indices here are fixed (1-5, 8, 9) and
            # do not follow self.node_attributes -- confirm this is intended.
            empty = root.create_empty_child(form=fields[1], lemma=fields[2], upos=fields[3],
                                            xpos=fields[4], feats=fields[5], misc=fields[9])
            empty.ord = fields[0]
            empty.raw_deps = fields[8]  # TODO
            continue

        node = root.create_child()

        # TODO slow implementation of speed-critical loading
        for n_attribute, attribute_name in enumerate(self.node_attributes):
            value = fields[n_attribute]
            if attribute_name == 'head':
                try:
                    parents.append(int(value))
                except ValueError:
                    if self.strict or value != '_':
                        raise
                    # A missing head ("_") is tolerated in non-strict mode:
                    # attach to the root.
                    if self.empty_parent == 'warn':
                        logging.warning("Empty parent/head index in '%s'", line)
                    parents.append(0)
            elif attribute_name == 'ord':
                node.ord = int(value)
            elif attribute_name == 'deps':
                node.raw_deps = value
            elif attribute_name != '_':
                # Columns named "_" are ignored; the rest map to node attributes.
                setattr(node, attribute_name, value)

        nodes.append(node)

    # If no nodes were read from the filehandle (so only root remained in nodes),
    # we return None as a sign of failure (end of file or more than one empty line).
    if len(nodes) == 1:
        return None

    # Empty sentences are not allowed in CoNLL-U,
    # but if the users want to save just the sentence string and/or sent_id
    # they need to create one artificial node and mark it with Empty=Yes.
    # In that case, we will delete this node, so the tree will have just the (technical) root.
    # See also udapi.block.write.Conllu, which is compatible with this trick.
    if len(nodes) == 2 and str(nodes[1].misc) == 'Empty=Yes':
        nodes.pop()
        root._children = []
        root._descendants = []

    # Set dependency parents (now, all nodes of the tree are created).
    # TODO: parent setter checks for cycles, but this is something like O(n*log n)
    # if done for each node. It could be done faster if the whole tree is checked at once.
    # Also parent setter removes the node from its old parent's list of children,
    # this could be skipped here by not using `node = root.create_child()`.
    for node_ord, node in enumerate(nodes[1:], 1):
        try:
            node.parent = nodes[parents[node_ord]]
        # TODO add a special Exception class for cycles
        except ValueError as e:
            if self.fix_cycles:
                logging.warning(
                    "Ignoring a cycle (attaching to the root instead):\n%s", e)
                node.parent = root
            else:
                raise
        except IndexError:
            raise ValueError("Node %s HEAD is out of range (%d)"
                             % (node, parents[node_ord]))

    # Create multi-word tokens.
    for fields in mwts:
        range_start, range_end = fields[0].split('-')
        words = nodes[int(range_start):int(range_end) + 1]
        root.create_multiword_token(words, form=fields[1], misc=fields[-1])

    return root