def read_tree(self, document=None):
    """Return a new Root holding the next input line as its text, or None at EOF."""
    if self.filehandle is None:
        return None
    raw = self.filehandle.readline()
    # readline() yields the empty string only at the end of the file.
    if not raw:
        return None
    tree = Root()
    tree.text = raw.rstrip()
    return tree
def test_deps_setter(self):
    """Test the deserialization of enhanced dependencies."""
    # Build a small tree: three children attached directly under the root.
    root = Root()
    for _ in range(3):
        root.create_child()
    first, second, _third = root.descendants()
    # Appending a dict-style dep must round-trip into the raw CoNLL-U string.
    first.deps.append({'parent': second, 'deprel': 'test'})
    self.assertEqual(first.raw_deps, '2:test')
def read_tree(self, document=None):
    """Read one VISL-cg tree from the filehandle and return its Root (None at EOF).

    Indented lines hold dependency nodes, unindented lines hold the surface
    form shared by the nodes that follow; an empty line ends the sentence.
    """
    if self.filehandle is None:
        return None
    root = Root()
    parents = [0]  # parents[i] = ord of the i-th node's parent
    words = []     # nodes read since the last surface-form line
    form = None
    for line in self.filehandle:
        line = line.rstrip()
        if line == '':
            break
        if line[0] == '#':  # Are comments allowed in VISL-cg?
            continue
        if line[0].isspace():
            # BUG FIX: the original executed `line.lstrip(line)`, which both
            # discards its result and would use the line itself as the set of
            # characters to strip; the node parser must receive the line with
            # its indentation removed.
            node, parent_ord = self._node(line.lstrip(), root)
            words.append(node)
            parents.append(parent_ord)
        else:
            # A new surface form starts here; flush the previous one first.
            if words:
                words[0].form = form
                if len(words) > 1:
                    # Distribute the space-separated parts of the form
                    # if their number matches the number of nodes.
                    split_forms = form.split()
                    if len(words) == len(split_forms):
                        for word, split_form in zip(words, split_forms):
                            word.form = split_form
                    else:
                        for word in words[1:]:
                            word.form = '_'
                    root.create_multiword_token(words, form=form)
                words = []
            # Strip the two delimiter characters on each side of the form
            # (presumably the VISL-cg "<...>" cohort wrapping — TODO confirm).
            form = line[2:-2]
    # Flush the nodes belonging to the last surface form.
    if words:
        words[0].form = form
        for word in words[1:]:
            word.form = '_'
    nodes = root.descendants(add_self=True)
    if len(nodes) == 1:
        return None  # only the technical root => nothing was read
    # Attach each node to its parent (all nodes exist by now).
    for node_ord, node in enumerate(nodes[1:], 1):
        try:
            node.parent = nodes[parents[node_ord]]
        except IndexError:
            raise ValueError("Node %s HEAD is out of range (%d)"
                             % (node, parents[node_ord]))
    return root
def read_tree(self):
    """Read one VISL-cg tree from the filehandle and return its Root (None at EOF).

    '#' lines are collected into root.comment; indented lines hold dependency
    nodes; unindented lines hold the surface form shared by the nodes that
    follow; an empty line ends the sentence.
    """
    if self.filehandle is None:
        return None
    root = Root()
    parents = [0]  # parents[i] = ord of the i-th node's parent
    words = []     # nodes read since the last surface-form line
    form = None
    for line in self.filehandle:
        line = line.rstrip()
        if line == '':
            # An empty line terminates the sentence.
            break
        if line[0] == '#':
            # Comment lines are stored on the root without the leading '#'.
            root.comment += line[1:] + "\n"
            continue
        if line[0].isspace():
            # Indented line = one dependency node belonging to the current form.
            node, parent_ord = self._node(line.lstrip(), root)
            words.append(node)
            parents.append(parent_ord)
            continue
        # Unindented line = a new surface form; flush the previous one first.
        if words:
            words[0].form = form
            if len(words) > 1:
                # Several nodes share one surface form: distribute the
                # whitespace-separated parts if their counts match...
                split_forms = form.split()
                if len(words) == len(split_forms):
                    for word, split_form in zip(words, split_forms):
                        word.form = split_form
                else:
                    # ...otherwise keep the whole form on the first node only.
                    for word in words[1:]:
                        word.form = '_'
                root.create_multiword_token(words, form=form)
            words = []
        # Strip the two delimiter characters on each side of the form
        # (presumably the VISL-cg "<...>" cohort wrapping — TODO confirm).
        form = line[2:-2]
    # Flush the nodes belonging to the last surface form.
    if words:
        words[0].form = form
        for word in words[1:]:
            word.form = '_'
    nodes = root.descendants(add_self=True)
    if len(nodes) == 1:
        # Only the technical root => nothing was read.
        return None
    # Attach each node to its parent (all nodes exist by now).
    for node_ord, node in enumerate(nodes[1:], 1):
        try:
            node.parent = nodes[parents[node_ord]]
        except IndexError:
            raise ValueError("Node %s HEAD is out of range (%d)"
                             % (node, parents[node_ord]))
    return root
def read_tree(self, document=None):
    """Read the next sentence (one per line) and return it as a new Root, or None at EOF."""
    if self.filehandle is None:
        return None
    raw = self.filehandle.readline()
    # readline() returns the empty string only at the end of the file,
    # while a blank line comes back as '\n'
    # (or '\r\n' if reading a Windows file on a Unix machine).
    if raw == '':
        return None
    if self.ignore_empty_lines:
        # Skip any run of blank lines; hitting EOF inside the run means
        # there are no further trees.
        while raw in ('\n', '\r\n'):
            raw = self.filehandle.readline()
            if raw == '':
                return None
    tree = Root()
    tree.text = raw.rstrip()
    return tree
def read_tree(self, document=None):
    """Return a Root whose text is the next (non-blank) input line, or None at EOF."""
    if self.filehandle is None:
        return None
    line = self.filehandle.readline()
    # An empty string from readline() marks the end of file; blank lines
    # arrive as '\n' (or '\r\n' when reading a Windows file on Unix).
    if not line:
        return None
    if self.ignore_empty_lines:
        # Keep reading while the line is blank; give up at end of file.
        while line == '\n' or line == '\r\n':
            line = self.filehandle.readline()
            if not line:
                return None
    root = Root()
    # The set of characters to strip from the end is configurable (self.rstrip).
    root.text = line.rstrip(self.rstrip)
    return root
def test_print_subtree(self):
    """Test print_subtree() method, which uses udapi.block.write.textmodetrees."""
    # Load a small fixture document shipped next to the tests.
    doc = Document()
    data_filename = os.path.join(os.path.dirname(__file__), 'data', 'enh_deps.conllu')
    doc.load_conllu(data_filename)
    root = doc.bundles[0].get_tree()
    # Default rendering: sent_id + text comments, then form/upos/deprel columns.
    expected1 = ("# sent_id = a-mf920901-001-p1s1A\n"
                 "# text = Slovenská ústava: pro i proti\n"
                 "─┮\n"
                 " │ ╭─╼ Slovenská ADJ amod\n"
                 " ╰─┾ ústava NOUN root\n"
                 " ┡─╼ : PUNCT punct\n"
                 " ╰─┮ pro ADP appos\n"
                 " ┡─╼ i CONJ cc\n"
                 " ╰─╼ proti ADP conj\n"
                 "\n")
    # Custom attributes (form,feats,misc) and suppressed sent_id/text comments.
    expected2 = ("─┮\n"
                 " │ ╭─╼ Slovenská Case=Nom|Degree=Pos|Gender=Fem|Negative=Pos|Number=Sing _\n"
                 " ╰─┾ ústava Case=Nom|Gender=Fem|Negative=Pos|Number=Sing SpaceAfter=No\n"
                 " ┡─╼ : _ _\n"
                 " ╰─┮ pro AdpType=Prep|Case=Acc LId=pro-1\n"
                 " ┡─╼ i _ LId=i-1\n"
                 " ╰─╼ proti AdpType=Prep|Case=Dat LId=proti-1\n"
                 "\n")
    # test non-projective tree
    root3 = Root()
    for i in range(1, 5):
        root3.create_child(form=str(i))
    nodes = root3.descendants(add_self=1)
    # Rehang nodes to create crossing edges (1->3, 4->2).
    nodes[1].parent = nodes[3]
    nodes[4].parent = nodes[2]
    expected3 = ("─┮\n"
                 " │ ╭─╼ 1\n"
                 " ┡─╪───┮ 2\n"
                 " ╰─┶ 3 │\n"
                 " ╰─╼ 4\n"
                 "\n")
    try:
        # Capture stdout, since print_subtree() writes to it directly.
        sys.stdout = capture = io.StringIO()
        root.print_subtree(color=False)
        self.assertEqual(capture.getvalue(), expected1)
        # Reset the buffer between the three sub-cases.
        capture.seek(0)
        capture.truncate()
        root.print_subtree(color=False, attributes='form,feats,misc',
                           print_sent_id=False, print_text=False)
        self.assertEqual(capture.getvalue(), expected2)
        capture.seek(0)
        capture.truncate()
        root3.print_subtree(color=False, attributes='form', print_sent_id=0, print_text=0)
        self.assertEqual(capture.getvalue(), expected3)
    finally:
        # Always restore the real stdout, even if an assertion fails.
        sys.stdout = sys.__stdout__  # pylint: disable=redefined-variable-type
def process_document(self, document):
    """Re-align predicted trees with the gold sentence segmentation of `document`.

    The gold tree of each bundle (bundle.trees[0]) is matched against the
    predicted trees by comparing their concatenated token characters with
    ASCII spaces removed; predicted trees are merged or split until each
    bundle receives one predicted tree covering exactly the gold characters.
    """
    if not document.bundles:
        return
    pred_trees = self.extract_pred_trees(document)
    # Remember nodes originally attached directly under a predicted root.
    was_subroot = set()
    for pred_tree in pred_trees:
        for n in pred_tree.children:
            was_subroot.add(n)
    for bundle_no, bundle in enumerate(document.bundles):
        g_tree = bundle.trees[0]
        p_tree = pred_trees.pop()
        # Compare sentences by their character content with spaces removed.
        g_chars = ''.join(t.form for t in g_tree.token_descendants).replace(' ', '')
        p_chars = ''.join(t.form for t in p_tree.token_descendants).replace(' ', '')
        if g_chars == p_chars:
            bundle.add_tree(p_tree)
            continue
        # Make sure that p_tree contains enough nodes.
        moved_roots = []  # NOTE(review): filled below but never read — dead code? confirm
        while len(p_chars) < len(g_chars):
            if not pred_trees:
                raise ValueError('no pred_trees:\n%s\n%s' % (p_chars, g_chars))
            new_p_tree = pred_trees.pop()
            p_chars += ''.join(t.form for t in new_p_tree.token_descendants).replace(' ', '')
            moved_roots.extend(new_p_tree.children)
            p_tree.steal_nodes(new_p_tree.descendants)
        self.choose_root(p_tree, was_subroot, g_tree)
        if not p_chars.startswith(g_chars):
            raise ValueError('sent_id=%s: !p_chars.startswith(g_chars):\np_chars=%s\ng_chars=%s'
                             % (g_tree.sent_id, p_chars, g_chars))
        if g_chars == p_chars:
            bundle.add_tree(p_tree)
            continue
        # Now p_tree contains more nodes than it should.
        p_chars = ''
        tokens = p_tree.token_descendants
        for index, token in enumerate(tokens):
            p_chars += token.form.replace(' ', '')
            if len(p_chars) > len(g_chars):
                logging.warning('Pred token crossing gold sentences: %s', g_tree.sent_id)
                # E.g. gold cs ln95048-151-p2s8 contains SpaceAfter=No on the last word
                # of the sentence, resulting in "uklidnila.Komentář" in the raw text.
                # It is not obvious how to fix this "properly", i.e. without increasing
                # or decreasing the resulting LAS. The current solution is quite hacky.
                if index + 1 == len(tokens):
                    # The overflowing token is the last one: carry the extra
                    # characters over into a new single-node tree for the next bundle.
                    next_p_tree = Root(zone=p_tree.zone)
                    pred_trees.append(next_p_tree)
                    next_p_tree.create_child(deprel='wrong', form=p_chars[len(g_chars):],
                                             misc='Rehanged=Yes')
                    bundle.add_tree(p_tree)
                    break
                else:
                    # Push the extra characters onto the following token.
                    next_tok = tokens[index + 1]
                    next_tok.form = p_chars[len(g_chars):] + next_tok.form
                    p_chars = g_chars
            if len(p_chars) == len(g_chars):
                # Exact boundary found: tokens after `index` move to a new tree
                # that will be matched against the next bundle.
                next_p_tree = Root(zone=p_tree.zone)
                words = []
                for token in tokens[index + 1:]:
                    if isinstance(token, MWT):
                        words.extend(token.words)
                    else:
                        words.append(token)
                for word in words:
                    if word in was_subroot:
                        del word.misc['Rehanged']
                    # Mark words whose functional dependents/heads get separated
                    # by the sentence split.
                    if word.parent is not p_tree and word.parent not in words:
                        if word.udeprel in FUNCTIONAL:
                            word.parent.misc['FuncChildMissing'] = 'Yes'
                    for child in word.children:
                        if child not in words and child.udeprel in FUNCTIONAL:
                            word.misc['FuncChildMissing'] = 'Yes'
                next_p_tree.steal_nodes(words)
                self.choose_root(p_tree, was_subroot, g_tree)
                self.choose_root(next_p_tree, was_subroot,
                                 document.bundles[bundle_no + 1].trees[0])
                pred_trees.append(next_p_tree)
                bundle.add_tree(p_tree)
                break
def test_print_subtree(self):
    """Test print_subtree() method, which uses udapi.block.write.textmodetrees."""
    # Load the fixture CoNLL-U file stored next to the tests.
    doc = Document()
    data_filename = os.path.join(os.path.dirname(__file__), 'data', 'enh_deps.conllu')
    doc.load_conllu(data_filename)
    root = doc.bundles[0].get_tree()
    # Case 1: default attributes plus the sent_id/text comment lines.
    expected1 = ("# sent_id = a-mf920901-001-p1s1A\n"
                 "# text = Slovenská ústava: pro i proti\n"
                 "─┮\n"
                 " │ ╭─╼ Slovenská ADJ amod\n"
                 " ╰─┾ ústava NOUN root\n"
                 " ┡─╼ : PUNCT punct\n"
                 " ╰─┮ pro ADP appos\n"
                 " ┡─╼ i CONJ cc\n"
                 " ╰─╼ proti ADP conj\n"
                 "\n")
    # Case 2: form/feats/misc columns, comment lines suppressed.
    expected2 = (
        "─┮\n"
        " │ ╭─╼ Slovenská Case=Nom|Degree=Pos|Gender=Fem|Negative=Pos|Number=Sing _\n"
        " ╰─┾ ústava Case=Nom|Gender=Fem|Negative=Pos|Number=Sing SpaceAfter=No\n"
        " ┡─╼ : _ _\n"
        " ╰─┮ pro AdpType=Prep|Case=Acc LId=pro-1\n"
        " ┡─╼ i _ LId=i-1\n"
        " ╰─╼ proti AdpType=Prep|Case=Dat LId=proti-1\n"
        "\n")
    # test non-projective tree
    root3 = Root()
    for i in range(1, 5):
        root3.create_child(form=str(i))
    nodes = root3.descendants(add_self=1)
    # Create crossing edges by rehanging 1 under 3 and 4 under 2.
    nodes[1].parent = nodes[3]
    nodes[4].parent = nodes[2]
    expected3 = ("─┮\n"
                 " │ ╭─╼ 1\n"
                 " ┡─╪───┮ 2\n"
                 " ╰─┶ 3 │\n"
                 " ╰─╼ 4\n"
                 "\n")
    try:
        # print_subtree() writes to stdout, so capture it in a StringIO.
        sys.stdout = capture = io.StringIO()
        root.print_subtree(color=False)
        self.assertEqual(capture.getvalue(), expected1)
        # Empty the capture buffer before the next sub-case.
        capture.seek(0)
        capture.truncate()
        root.print_subtree(color=False, attributes='form,feats,misc',
                           print_sent_id=False, print_text=False)
        self.assertEqual(capture.getvalue(), expected2)
        capture.seek(0)
        capture.truncate()
        root3.print_subtree(color=False, attributes='form', print_sent_id=0, print_text=0)
        self.assertEqual(capture.getvalue(), expected3)
    finally:
        # Restore the real stdout no matter what happened above.
        sys.stdout = sys.__stdout__  # pylint: disable=redefined-variable-type
def read_tree_from_lines(self, lines):
    """Build a dependency tree from CoNLL-U `lines` and return its Root.

    Returns None if the lines contain no nodes (end of file or
    more than one empty line). Raises ValueError on out-of-range HEADs
    and (unless self.fix_cycles) on dependency cycles.
    """
    root = Root()
    nodes = [root]    # nodes[ord] gives the node with that ord
    parents = [0]     # parents[ord] holds the HEAD index of nodes[ord]
    mwts = []         # multi-word token field-lists, processed at the end
    for line in lines:
        if line[0] == '#':
            self.parse_comment_line(line, root)
        else:
            # parse_node_line fills nodes/parents/mwts as appropriate.
            self.parse_node_line(line, root, nodes, parents, mwts)
    # If no nodes were read from the filehandle (so only root remained in nodes),
    # we return None as a sign of failure (end of file or more than one empty line).
    if len(nodes) == 1:
        return None
    # Empty sentences are not allowed in CoNLL-U,
    # but if the users want to save just the sentence string and/or sent_id
    # they need to create one artificial node and mark it with Empty=Yes.
    # In that case, we will delete this node, so the tree will have just the (technical) root.
    # See also udapi.block.write.Conllu, which is compatible with this trick.
    if len(nodes) == 2 and str(nodes[1].misc) == 'Empty=Yes':
        nodes.pop()
        root._children = []
        root._descendants = []
    # Set dependency parents (now, all nodes of the tree are created).
    for node_ord, node in enumerate(nodes[1:], 1):
        try:
            parent = nodes[parents[node_ord]]
        except IndexError:
            raise ValueError("Node %s HEAD is out of range (%d)"
                             % (node, parents[node_ord]))
        if node is parent:
            if self.fix_cycles:
                logging.warning(
                    "Ignoring a cycle (attaching to the root instead):\n%s", node)
                # BUG FIX: only re-target the parent and let the shared
                # attachment below run; the original attached the node to the
                # root here and then fell through to the generic attachment,
                # re-attaching the node to itself after all.
                parent = root
            else:
                raise ValueError(
                    f"Detected a cycle: {node} attached to itself")
        elif node.children:
            # Attaching `node` under `parent` can only close a cycle if an
            # already-attached chain leads from `parent` up to `node`,
            # so climb towards the root and look for `node` on the way.
            climbing = parent._parent
            while climbing:
                if climbing is node:
                    if self.fix_cycles:
                        logging.warning(
                            "Ignoring a cycle (attaching to the root instead):\n%s", parent)
                        parent = root
                        break
                    else:
                        raise ValueError(f"Detected a cycle: {node}")
                climbing = climbing._parent
        # Attach via the private attributes (bypassing the slower parent setter).
        node._parent = parent
        parent._children.append(node)
    # Create multi-word tokens.
    for fields in mwts:
        range_start, range_end = fields[0].split('-')
        words = nodes[int(range_start):int(range_end) + 1]
        root.create_multiword_token(words, form=fields[1], misc=fields[-1])
    return root
def tokenize_tag_parse_tree(self, root, resegment=False, tag=True, parse=True):
    """Tokenize, tag (+lemmatize, fill FEATS) and parse the text stored in `root.text`.

    If resegment=True, the returned list of Udapi trees may contain multiple trees.

    Raises ValueError if `root` already has children, or if parse=True is
    combined with tag=False.
    """
    if root.children:
        raise ValueError(
            'Tree already contained nodes before tokenization')
    # Tokenize and segment the text (segmentation cannot be turned off in older UDPipe versions).
    self.tokenizer.setText(root.text)
    is_another = True
    u_sentences = []
    while is_another:
        u_sentence = Sentence()
        # nextSentence() fills u_sentence and reports whether one was found.
        is_another = self.tokenizer.nextSentence(u_sentence)
        if is_another:
            u_sentences.append(u_sentence)
    # If resegmentation was not required, we need to join the segments.
    if not resegment and len(u_sentences) > 1:
        first_sent = u_sentences[0]
        # Index 0 of `words` is skipped throughout (UDPipe's technical root
        # word, presumably — TODO confirm), hence the -1 here.
        n_words = first_sent.words.size() - 1
        for other_sent in u_sentences[1:]:
            other_words = other_sent.words.size() - 1
            for i in range(1, other_words + 1):
                u_w = other_sent.words[i]
                n_words += 1
                # Renumber so word ids stay consecutive in the joined sentence.
                u_w.id = n_words
                first_sent.words.append(u_w)
        u_sentences = [first_sent]
    # tagging and parsing
    if tag:
        for u_sentence in u_sentences:
            self.tool.tag(u_sentence, Model.DEFAULT)
            if parse:
                self.tool.parse(u_sentence, Model.DEFAULT)
    elif parse:
        raise ValueError(
            'Combination parse=True tag=False is not allowed.')
    # converting UDPipe nodes to Udapi nodes
    # The first sentence reuses the caller's `root`; further sentences
    # (resegment=True) get freshly created roots.
    new_root = root
    trees = []
    for u_sentence in u_sentences:
        if not new_root:
            new_root = Root()
        heads, nodes = [], [new_root]
        u_words = u_sentence.words
        for i in range(1, u_words.size()):
            u_w = u_words[i]
            node = new_root.create_child(
                form=u_w.form, lemma=u_w.lemma, upos=u_w.upostag,
                xpos=u_w.xpostag, feats=u_w.feats, deprel=u_w.deprel, misc=u_w.misc,
            )
            if parse:
                heads.append(u_w.head)
                nodes.append(node)
        if parse:
            # Attach only after all nodes exist — heads may point forward.
            for node in nodes[1:]:
                head = heads.pop(0)
                node.parent = nodes[head]
        trees.append(new_root)
        new_root = None
    return trees
def process_document(self, document):
    """Re-align predicted trees with the gold sentence segmentation of `document`.

    Gold and predicted sentences are compared via their concatenated token
    characters after space removal (self._strip_spaces plus a filter on
    Unicode category Zs); predicted trees are merged or split until each
    bundle gets one predicted tree covering exactly its gold characters.
    """
    if not document.bundles:
        return
    pred_trees = self.extract_pred_trees(document)
    # Remember nodes originally attached directly under a predicted root.
    was_subroot = set()
    for pred_tree in pred_trees:
        for n in pred_tree.children:
            was_subroot.add(n)
    for bundle_no, bundle in enumerate(document.bundles):
        g_tree = bundle.trees[0]
        p_tree = pred_trees.pop()
        g_chars = self._strip_spaces(''.join(t.form for t in g_tree.token_descendants))
        p_chars = self._strip_spaces(''.join(t.form for t in p_tree.token_descendants))
        # Additionally drop all Unicode space separators (category Zs),
        # e.g. no-break spaces, which _strip_spaces may have left in.
        g_chars = ''.join(filter(lambda c: unicodedata.category(c) != "Zs", g_chars))
        p_chars = ''.join(filter(lambda c: unicodedata.category(c) != "Zs", p_chars))
        if g_chars == p_chars:
            bundle.add_tree(p_tree)
            continue
        # Make sure that p_tree contains enough nodes.
        moved_roots = []  # NOTE(review): filled below but never read — dead code? confirm
        while len(p_chars) < len(g_chars):
            if not pred_trees:
                raise ValueError('no pred_trees:\n%s\n%s' % (p_chars, g_chars))
            new_p_tree = pred_trees.pop()
            p_chars += self._strip_spaces(''.join(t.form for t in new_p_tree.token_descendants))
            moved_roots.extend(new_p_tree.children)
            p_tree.steal_nodes(new_p_tree.descendants)
        self.choose_root(p_tree, was_subroot, g_tree)
        if not p_chars.startswith(g_chars):
            raise ValueError('sent_id=%s: !p_chars.startswith(g_chars):\np_chars=%s\ng_chars=%s'
                             % (g_tree.sent_id, p_chars, g_chars))
        if g_chars == p_chars:
            bundle.add_tree(p_tree)
            continue
        # Now p_tree contains more nodes than it should.
        p_chars = ''
        tokens = p_tree.token_descendants
        for index, token in enumerate(tokens):
            p_chars += self._strip_spaces(token.form)
            if len(p_chars) > len(g_chars):
                logging.warning('Pred token crossing gold sentences: %s', g_tree.sent_id)
                # E.g. gold cs ln95048-151-p2s8 contains SpaceAfter=No on the last word
                # of the sentence, resulting in "uklidnila.Komentář" in the raw text.
                # It is not obvious how to fix this "properly", i.e. without increasing
                # or decreasing the resulting LAS. The current solution is quite hacky.
                if index + 1 == len(tokens):
                    # Last token overflows: carry the extra characters over
                    # into a new single-node tree for the next bundle.
                    next_p_tree = Root(zone=p_tree.zone)
                    pred_trees.append(next_p_tree)
                    next_p_tree.create_child(deprel='wrong', form=p_chars[len(g_chars):],
                                             misc='Rehanged=Yes')
                    bundle.add_tree(p_tree)
                    break
                else:
                    # Push the extra characters onto the following token.
                    next_tok = tokens[index + 1]
                    next_tok.form = p_chars[len(g_chars):] + next_tok.form
                    p_chars = g_chars
            if len(p_chars) == len(g_chars):
                # Exact boundary: everything after `index` moves to a new tree
                # that will be matched against the next bundle.
                next_p_tree = Root(zone=p_tree.zone)
                words = []
                for token in tokens[index + 1:]:
                    if isinstance(token, MWT):
                        words.extend(token.words)
                    else:
                        words.append(token)
                for word in words:
                    if word in was_subroot:
                        del word.misc['Rehanged']
                    # Mark words whose functional dependents/heads end up
                    # separated by the sentence split.
                    if word.parent is not p_tree and word.parent not in words:
                        if word.udeprel in FUNCTIONAL:
                            word.parent.misc['FuncChildMissing'] = 'Yes'
                    for child in word.children:
                        if child not in words and child.udeprel in FUNCTIONAL:
                            word.misc['FuncChildMissing'] = 'Yes'
                next_p_tree.steal_nodes(words)
                self.choose_root(p_tree, was_subroot, g_tree)
                self.choose_root(next_p_tree, was_subroot,
                                 document.bundles[bundle_no + 1].trees[0])
                pred_trees.append(next_p_tree)
                bundle.add_tree(p_tree)
                break
def read_tree_from_lines(self, lines):
    """Parse CoNLL-U `lines` into a dependency tree and return its Root.

    Returns None if the lines contain no nodes (end of file or more than
    one empty line). Raises RuntimeError on malformed lines in strict mode,
    ValueError on out-of-range HEADs and (unless self.fix_cycles) on cycles.
    """
    root = Root()
    nodes = [root]    # nodes[ord] gives the node with that ord
    parents = [0]     # parents[ord] holds the HEAD column of nodes[ord]
    mwts = []         # multi-word token field-lists, processed at the end
    for line in lines:
        if line[0] == '#':
            self.parse_comment_line(line, root)
        else:
            fields = line.split('\t')
            if len(fields) != 10:
                if self.strict:
                    raise RuntimeError('Wrong number of columns in %r' % line)
                # Non-strict mode: pad the missing columns with '_'.
                fields.extend(['_'] * (10 - len(fields)))
            # multi-word tokens will be processed later
            if '-' in fields[0]:
                mwts.append(fields)
                continue
            # Empty nodes (decimal ord, e.g. "2.1") are stored separately.
            if '.' in fields[0]:
                empty = root.create_empty_child(form=fields[1], lemma=fields[2], upos=fields[3],
                                                xpos=fields[4], feats=fields[5], misc=fields[9])
                empty.ord = fields[0]
                empty.raw_deps = fields[8]  # TODO
                continue
            # Normalize '_' placeholders to None for upos, xpos and deprel.
            if fields[3] == '_':
                fields[3] = None
            if fields[4] == '_':
                fields[4] = None
            if fields[7] == '_':
                fields[7] = None
            # ord,form,lemma,upos,xpos,feats,head,deprel,deps,misc
            node = Node(root=root, form=fields[1], lemma=fields[2], upos=fields[3],
                        xpos=fields[4], feats=fields[5], deprel=fields[7], misc=fields[9])
            root._descendants.append(node)
            node._ord = int(fields[0])
            if fields[8] != '_':
                node.raw_deps = fields[8]
            try:
                parents.append(int(fields[6]))
            except ValueError as exception:
                if not self.strict and fields[6] == '_':
                    if self.empty_parent == 'warn':
                        logging.warning("Empty parent/head index in '%s'", line)
                    parents.append(0)
                else:
                    raise exception
            nodes.append(node)
    # If no nodes were read from the filehandle (so only root remained in nodes),
    # we return None as a sign of failure (end of file or more than one empty line).
    if len(nodes) == 1:
        return None
    # Empty sentences are not allowed in CoNLL-U,
    # but if the users want to save just the sentence string and/or sent_id
    # they need to create one artificial node and mark it with Empty=Yes.
    # In that case, we will delete this node, so the tree will have just the (technical) root.
    # See also udapi.block.write.Conllu, which is compatible with this trick.
    if len(nodes) == 2 and str(nodes[1].misc) == 'Empty=Yes':
        nodes.pop()
        root._children = []
        root._descendants = []
    # Set dependency parents (now, all nodes of the tree are created).
    for node_ord, node in enumerate(nodes[1:], 1):
        try:
            parent = nodes[parents[node_ord]]
        except IndexError:
            raise ValueError("Node %s HEAD is out of range (%d)"
                             % (node, parents[node_ord]))
        if node is parent:
            if self.fix_cycles:
                logging.warning(
                    "Ignoring a cycle (attaching to the root instead):\n%s", node)
                # BUG FIX: only re-target the parent and let the shared
                # attachment below run; the original attached the node to the
                # root here and then fell through to the generic attachment,
                # re-attaching the node to itself after all.
                parent = root
            else:
                raise ValueError(
                    f"Detected a cycle: {node} attached to itself")
        elif node.children:
            # Attaching `node` under `parent` can only close a cycle if an
            # already-attached chain leads from `parent` up to `node`,
            # so climb towards the root and look for `node` on the way.
            climbing = parent._parent
            while climbing:
                if climbing is node:
                    if self.fix_cycles:
                        logging.warning(
                            "Ignoring a cycle (attaching to the root instead):\n%s", parent)
                        parent = root
                        break
                    else:
                        raise ValueError(f"Detected a cycle: {node}")
                climbing = climbing._parent
        # Attach via the private attributes (bypassing the slower parent setter).
        node._parent = parent
        parent._children.append(node)
    # Create multi-word tokens.
    for fields in mwts:
        range_start, range_end = fields[0].split('-')
        words = nodes[int(range_start):int(range_end) + 1]
        root.create_multiword_token(words, form=fields[1], misc=fields[-1])
    return root
def create_tree(self, zone=None):
    """Create a new tree, attach it to this bundle and return its root.

    zone: optional zone label assigned to the new tree (default None).
    """
    new_root = Root()
    new_root.zone = zone
    self.add_tree(new_root)
    return new_root
def read_tree(self, document=None):
    """Read one tree in a configurable CoNLL-like format; return its Root or None.

    The column separator is selected by self.separator ('tab', 'space' or
    'doublespace') and the meaning of the columns by self.node_attributes.
    Returns None when no nodes could be read (end of file).
    """
    if self.filehandle is None:
        return None
    root = Root()
    nodes = [root]    # nodes[ord] gives the node with that ord
    parents = [0]     # parents[ord] holds the HEAD column of nodes[ord]
    mwts = []         # multi-word token field-lists, processed at the end
    for line in self.filehandle:
        line = line.rstrip()
        if line == '':
            break
        if line[0] == '#':
            self.parse_comment_line(line, root)
        else:
            if self.separator == 'tab':
                fields = line.split('\t')
            elif self.separator == 'space':
                fields = line.split()
            elif self.separator == 'doublespace':
                # BUG FIX: 'doublespace' must split on runs of TWO or more
                # spaces; the original pattern ' +' split on every single
                # space, making this option behave like 'space'.
                fields = re.split('  +', line)
            else:
                raise ValueError('separator=%s is not valid' % self.separator)
            if len(fields) != len(self.node_attributes):
                if self.strict:
                    raise RuntimeError('Wrong number of columns in %r' % line)
                # Non-strict mode: pad the missing columns with '_'.
                fields.extend(['_'] * (len(self.node_attributes) - len(fields)))
            # multi-word tokens will be processed later
            if '-' in fields[0]:
                mwts.append(fields)
                continue
            # Empty nodes (decimal ord, e.g. "2.1") are stored separately.
            if '.' in fields[0]:
                empty = root.create_empty_child(form=fields[1], lemma=fields[2], upos=fields[3],
                                                xpos=fields[4], feats=fields[5], misc=fields[9])
                empty.ord = fields[0]
                empty.raw_deps = fields[8]  # TODO
                continue
            node = root.create_child()
            # TODO slow implementation of speed-critical loading
            for (n_attribute, attribute_name) in enumerate(self.node_attributes):
                if attribute_name == 'head':
                    try:
                        parents.append(int(fields[n_attribute]))
                    except ValueError as exception:
                        if not self.strict and fields[n_attribute] == '_':
                            if self.empty_parent == 'warn':
                                logging.warning(
                                    "Empty parent/head index in '%s'", line)
                            parents.append(0)
                        else:
                            raise exception
                elif attribute_name == 'ord':
                    setattr(node, 'ord', int(fields[n_attribute]))
                elif attribute_name == 'deps':
                    setattr(node, 'raw_deps', fields[n_attribute])
                elif attribute_name != '_':
                    setattr(node, attribute_name, fields[n_attribute])
            nodes.append(node)
    # If no nodes were read from the filehandle (so only root remained in nodes),
    # we return None as a sign of failure (end of file or more than one empty line).
    if len(nodes) == 1:
        return None
    # Empty sentences are not allowed in CoNLL-U,
    # but if the users want to save just the sentence string and/or sent_id
    # they need to create one artificial node and mark it with Empty=Yes.
    # In that case, we will delete this node, so the tree will have just the (technical) root.
    # See also udapi.block.write.Conllu, which is compatible with this trick.
    if len(nodes) == 2 and nodes[1].misc == 'Empty=Yes':
        nodes.pop()
    # Set dependency parents (now, all nodes of the tree are created).
    # TODO: parent setter checks for cycles, but this is something like O(n*log n)
    # if done for each node. It could be done faster if the whole tree is checked at once.
    # Also parent setter removes the node from its old parent's list of children,
    # this could be skipped here by not using `node = root.create_child()`.
    for node_ord, node in enumerate(nodes[1:], 1):
        try:
            node.parent = nodes[parents[node_ord]]
        except IndexError:
            raise ValueError("Node %s HEAD is out of range (%d)"
                             % (node, parents[node_ord]))
    # Create multi-word tokens.
    for fields in mwts:
        range_start, range_end = fields[0].split('-')
        words = nodes[int(range_start):int(range_end) + 1]
        root.create_multiword_token(words, form=fields[1], misc=fields[-1])
    return root
def process_document(self, document):
    """Load up to self.bundles_per_document CoNLL-like bundles into `document`.

    Reads self.filehandle line by line; at end of file sets self.finished
    and returns. Otherwise returns `document` once the quota is reached.
    """
    number_of_loaded_bundles = 0
    nodes = []
    comment = ''
    while number_of_loaded_bundles < self.bundles_per_document:
        # TODO: more or less cut'n'paste from document.py (in which it should be deleted)
        line = self.filehandle.readline()
        if line == '':  # EOF
            self.finished = True
            return
            # TODO: the last processed bundle should be finished at this point
            # (because of the guaranteed empty line), but it should be checked
        if re.search(r'^#', line):
            # Accumulate comment lines; they are stored on the next tree's root.
            comment = comment + line
        elif re.search(r'^\d+\-', line):
            # HACK: multiword tokens temporarily avoided
            pass
        elif line.strip():
            # A node line; open a new bundle/tree if this is its first node.
            if not nodes:
                bundle = Bundle()
                document.bundles.append(bundle)
                root = Root()
                root._aux['comment'] = comment  # TODO: store this somewhere properly
                nodes = [root]
                bundle.trees.append(root)
            columns = line.strip().split('\t')
            node = Node()
            nodes.append(node)
            columns.append(None)  # TODO: why was the last column missing in some files?
            # BUG FIX: `xrange` is Python 2 only; use enumerate instead.
            for index, attrname in enumerate(Document.attrnames):
                setattr(node, attrname, columns[index])
            try:  # TODO: where do the underscores in this column come from?
                node.head = int(node.head)
            except ValueError:
                node.head = 0
            try:  # TODO: handle multiword tokens properly
                node.ord = int(node.ord)
            except ValueError:
                pass  # node.ord = 0
        else:
            # An empty line is guaranteed even after the last sentence in a CoNLL-U file.
            if len(nodes) == 0:
                # BUG FIX: this was a Python 2 print statement, which is a
                # syntax error under Python 3 (used by the rest of this file).
                print("Warning: this is weird: probably two empty lines following each other")
                # TODO: resolve
            else:
                # The sentence is complete: attach nodes to their parents.
                number_of_loaded_bundles += 1
                nodes[0]._aux['descendants'] = nodes[1:]
                for node in nodes[1:]:
                    node.set_parent(nodes[node.head])
                nodes = []
                comment = ''
    return document