def test_print_subtree(self):
    """Test print_subtree() method, which uses udapi.block.write.textmodetrees."""
    from contextlib import redirect_stdout
    doc = Document()
    data_filename = os.path.join(os.path.dirname(__file__), 'data', 'enh_deps.conllu')
    doc.load_conllu(data_filename)
    root = doc.bundles[0].get_tree()
    expected1 = ("# sent_id = a-mf920901-001-p1s1A\n"
                 "# text = Slovenská ústava: pro i proti\n"
                 "─┮\n"
                 " │ ╭─╼ Slovenská ADJ amod\n"
                 " ╰─┾ ústava NOUN root\n"
                 " ┡─╼ : PUNCT punct\n"
                 " ╰─┮ pro ADP appos\n"
                 " ┡─╼ i CONJ cc\n"
                 " ╰─╼ proti ADP conj\n"
                 "\n")
    expected2 = ("─┮\n"
                 " │ ╭─╼ Slovenská Case=Nom|Degree=Pos|Gender=Fem|Negative=Pos|Number=Sing _\n"
                 " ╰─┾ ústava Case=Nom|Gender=Fem|Negative=Pos|Number=Sing SpaceAfter=No\n"
                 " ┡─╼ : _ _\n"
                 " ╰─┮ pro AdpType=Prep|Case=Acc LId=pro-1\n"
                 " ┡─╼ i _ LId=i-1\n"
                 " ╰─╼ proti AdpType=Prep|Case=Dat LId=proti-1\n"
                 "\n")
    # test non-projective tree
    root3 = Root()
    for i in range(1, 5):
        root3.create_child(form=str(i))
    nodes = root3.descendants(add_self=1)
    nodes[1].parent = nodes[3]
    nodes[4].parent = nodes[2]
    expected3 = ("─┮\n"
                 " │ ╭─╼ 1\n"
                 " ┡─╪───┮ 2\n"
                 " ╰─┶ 3 │\n"
                 " ╰─╼ 4\n"
                 "\n")
    # Use redirect_stdout with a fresh StringIO per call instead of manually
    # swapping sys.stdout: this replaces the seek/truncate dance and the
    # try/finally restore, and cannot leave sys.stdout broken if an assert fails.
    capture = io.StringIO()
    with redirect_stdout(capture):
        root.print_subtree(color=False)
    self.assertEqual(capture.getvalue(), expected1)
    capture = io.StringIO()
    with redirect_stdout(capture):
        root.print_subtree(color=False, attributes='form,feats,misc',
                           print_sent_id=False, print_text=False)
    self.assertEqual(capture.getvalue(), expected2)
    capture = io.StringIO()
    with redirect_stdout(capture):
        root3.print_subtree(color=False, attributes='form', print_sent_id=0, print_text=0)
    self.assertEqual(capture.getvalue(), expected3)
def test_deps_setter(self):
    """Appending to node.deps must be reflected in the serialized raw_deps string."""
    # Build a minimal tree: a technical root with three children.
    root = Root()
    children = [root.create_child() for _ in range(3)]
    # Add one enhanced dependency: node 1 depends on node 2 with deprel "test".
    children[0].deps.append({'parent': children[1], 'deprel': 'test'})
    self.assertEqual(children[0].raw_deps, '2:test')
def read_tree(self, document=None):
    """Read one sentence from self.filehandle and return it as a new tree (Root).

    Returns None when the filehandle is missing or when no node lines were
    read (end of file, or more than one empty line in a row).
    The `document` parameter is accepted but not used in this method.
    """
    if self.filehandle is None:
        return None
    root = Root()
    nodes = [root]   # nodes[i] will be the node with ord i (root has ord 0)
    parents = [0]    # parents[i] is the raw HEAD index of nodes[i]; linked after reading
    mwts = []        # raw field-lists of multi-word token lines, processed at the end
    for line in self.filehandle:
        line = line.rstrip()
        if line == '':  # an empty line terminates the sentence
            break
        if line[0] == '#':
            self.parse_comment_line(line, root)
        else:
            if self.separator == 'tab':
                fields = line.split('\t')
            elif self.separator == 'space':
                fields = line.split()
            elif self.separator == 'doublespace':
                # NOTE(review): the pattern ' +' splits on ANY run of spaces,
                # which makes 'doublespace' behave almost like 'space';
                # '  +' (two or more spaces) may have been intended -- confirm.
                fields = re.split(' +', line)
            else:
                raise ValueError('separator=%s is not valid' % self.separator)
            if len(fields) != len(self.node_attributes):
                if self.strict:
                    raise RuntimeError('Wrong number of columns in %r' % line)
                # Non-strict mode: pad missing trailing columns with '_'.
                fields.extend(['_'] * (len(self.node_attributes) - len(fields)))
            # multi-word tokens will be processed later
            if '-' in fields[0]:
                mwts.append(fields)
                continue
            # Empty nodes (decimal ords such as "8.1") get a dedicated factory;
            # their ord is kept as the raw string.
            if '.' in fields[0]:
                empty = root.create_empty_child(form=fields[1], lemma=fields[2], upos=fields[3],
                                                xpos=fields[4], feats=fields[5], misc=fields[9])
                empty.ord = fields[0]
                empty.raw_deps = fields[8]  # TODO
                continue
            node = root.create_child()
            # TODO slow implementation of speed-critical loading
            for (n_attribute, attribute_name) in enumerate(self.node_attributes):
                if attribute_name == 'head':
                    try:
                        parents.append(int(fields[n_attribute]))
                    except ValueError as exception:
                        if not self.strict and fields[n_attribute] == '_':
                            if self.empty_parent == 'warn':
                                logging.warning(
                                    "Empty parent/head index in '%s'", line)
                            # Fall back to attaching under the technical root.
                            parents.append(0)
                        else:
                            raise exception
                elif attribute_name == 'ord':
                    setattr(node, 'ord', int(fields[n_attribute]))
                elif attribute_name == 'deps':
                    setattr(node, 'raw_deps', fields[n_attribute])
                elif attribute_name != '_':
                    setattr(node, attribute_name, fields[n_attribute])
            nodes.append(node)
    # If no nodes were read from the filehandle (so only root remained in nodes),
    # we return None as a sign of failure (end of file or more than one empty line).
    if len(nodes) == 1:
        return None
    # Empty sentences are not allowed in CoNLL-U,
    # but if the users want to save just the sentence string and/or sent_id
    # they need to create one artificial node and mark it with Empty=Yes.
    # In that case, we will delete this node, so the tree will have just the (technical) root.
    # See also udapi.block.write.Conllu, which is compatible with this trick.
    if len(nodes) == 2 and nodes[1].misc == 'Empty=Yes':
        nodes.pop()
    # Set dependency parents (now, all nodes of the tree are created).
    # TODO: parent setter checks for cycles, but this is something like O(n*log n)
    # if done for each node. It could be done faster if the whole tree is checked at once.
    # Also parent setter removes the node from its old parent's list of children,
    # this could be skipped here by not using `node = root.create_child()`.
    for node_ord, node in enumerate(nodes[1:], 1):
        try:
            node.parent = nodes[parents[node_ord]]
        except IndexError:
            raise ValueError("Node %s HEAD is out of range (%d)" % (node, parents[node_ord]))
    # Create multi-word tokens.
    for fields in mwts:
        range_start, range_end = fields[0].split('-')
        words = nodes[int(range_start):int(range_end) + 1]
        root.create_multiword_token(words, form=fields[1], misc=fields[-1])
    return root
def process_document(self, document):
    """Align predicted trees with the gold sentence segmentation of `document`.

    For each bundle, one predicted tree is taken from extract_pred_trees() and
    added to the bundle. When predicted segmentation differs from gold, nodes
    are moved between predicted trees (steal_nodes) and a token form may even
    be split, so that each bundle's predicted tree covers exactly the same
    characters (ignoring ASCII spaces) as its gold tree.
    """
    if not document.bundles:
        return
    # NOTE(review): pred_trees is consumed with pop() from the END of the list;
    # presumably extract_pred_trees returns the trees in reverse order -- confirm.
    pred_trees = self.extract_pred_trees(document)
    # Remember which nodes were children of a root in the original predicted trees.
    was_subroot = set()
    for pred_tree in pred_trees:
        for n in pred_tree.children:
            was_subroot.add(n)
    for bundle_no, bundle in enumerate(document.bundles):
        g_tree = bundle.trees[0]
        p_tree = pred_trees.pop()
        # Sentences are compared via their space-less character strings.
        g_chars = ''.join(t.form for t in g_tree.token_descendants).replace(' ', '')
        p_chars = ''.join(t.form for t in p_tree.token_descendants).replace(' ', '')
        if g_chars == p_chars:
            bundle.add_tree(p_tree)
            continue
        # Make sure that p_tree contains enough nodes.
        # NOTE(review): moved_roots is collected but never read in this method -- confirm
        # whether it is dead code or used via some side effect elsewhere.
        moved_roots = []
        while len(p_chars) < len(g_chars):
            if not pred_trees:
                raise ValueError('no pred_trees:\n%s\n%s' % (p_chars, g_chars))
            new_p_tree = pred_trees.pop()
            p_chars += ''.join(t.form for t in new_p_tree.token_descendants).replace(' ', '')
            moved_roots.extend(new_p_tree.children)
            p_tree.steal_nodes(new_p_tree.descendants)
        self.choose_root(p_tree, was_subroot, g_tree)
        if not p_chars.startswith(g_chars):
            raise ValueError('sent_id=%s: !p_chars.startswith(g_chars):\np_chars=%s\ng_chars=%s'
                             % (g_tree.sent_id, p_chars, g_chars))
        if g_chars == p_chars:
            bundle.add_tree(p_tree)
            continue
        # Now p_tree contains more nodes than it should.
        # Walk the tokens and find where the gold sentence boundary falls.
        p_chars = ''
        tokens = p_tree.token_descendants
        for index, token in enumerate(tokens):
            p_chars += token.form.replace(' ', '')
            if len(p_chars) > len(g_chars):
                # A single predicted token spans the gold sentence boundary.
                logging.warning('Pred token crossing gold sentences: %s', g_tree.sent_id)
                # E.g. gold cs ln95048-151-p2s8 contains SpaceAfter=No on the last word
                # of the sentence, resulting in "uklidnila.Komentář" in the raw text.
                # It is not obvious how to fix this "properly", i.e. without increasing
                # or decreasing the resulting LAS. The current solution is quite hacky.
                if index + 1 == len(tokens):
                    # The crossing token is the last one: push the overflow characters
                    # into a fresh one-node tree for the next bundle.
                    next_p_tree = Root(zone=p_tree.zone)
                    pred_trees.append(next_p_tree)
                    next_p_tree.create_child(deprel='wrong', form=p_chars[len(g_chars):],
                                             misc='Rehanged=Yes')
                    bundle.add_tree(p_tree)
                    break
                else:
                    # Prepend the overflow characters to the following token,
                    # so the boundary now falls exactly after the current token.
                    next_tok = tokens[index + 1]
                    next_tok.form = p_chars[len(g_chars):] + next_tok.form
                    p_chars = g_chars
            if len(p_chars) == len(g_chars):
                # Boundary reached: move all remaining tokens into a new tree
                # that will serve as the prediction for the NEXT bundle.
                next_p_tree = Root(zone=p_tree.zone)
                words = []
                for token in tokens[index + 1:]:
                    if isinstance(token, MWT):
                        words.extend(token.words)
                    else:
                        words.append(token)
                for word in words:
                    if word in was_subroot:
                        # NOTE(review): presumably choose_root marked former subroots
                        # with Rehanged=Yes earlier; this undoes that mark -- confirm.
                        del word.misc['Rehanged']
                    # Mark words whose functional relations get severed by the split.
                    if word.parent is not p_tree and word.parent not in words:
                        if word.udeprel in FUNCTIONAL:
                            word.parent.misc['FuncChildMissing'] = 'Yes'
                    for child in word.children:
                        if child not in words and child.udeprel in FUNCTIONAL:
                            word.misc['FuncChildMissing'] = 'Yes'
                next_p_tree.steal_nodes(words)
                self.choose_root(p_tree, was_subroot, g_tree)
                self.choose_root(next_p_tree, was_subroot,
                                 document.bundles[bundle_no + 1].trees[0])
                pred_trees.append(next_p_tree)
                bundle.add_tree(p_tree)
                break
def test_print_subtree(self):
    """Test print_subtree() method, which uses udapi.block.write.textmodetrees."""
    doc = Document()
    conllu_path = os.path.join(os.path.dirname(__file__), 'data', 'enh_deps.conllu')
    doc.load_conllu(conllu_path)
    tree = doc.bundles[0].get_tree()
    expected1 = ("# sent_id = a-mf920901-001-p1s1A\n"
                 "# text = Slovenská ústava: pro i proti\n"
                 "─┮\n"
                 " │ ╭─╼ Slovenská ADJ amod\n"
                 " ╰─┾ ústava NOUN root\n"
                 " ┡─╼ : PUNCT punct\n"
                 " ╰─┮ pro ADP appos\n"
                 " ┡─╼ i CONJ cc\n"
                 " ╰─╼ proti ADP conj\n"
                 "\n")
    expected2 = ("─┮\n"
                 " │ ╭─╼ Slovenská Case=Nom|Degree=Pos|Gender=Fem|Negative=Pos|Number=Sing _\n"
                 " ╰─┾ ústava Case=Nom|Gender=Fem|Negative=Pos|Number=Sing SpaceAfter=No\n"
                 " ┡─╼ : _ _\n"
                 " ╰─┮ pro AdpType=Prep|Case=Acc LId=pro-1\n"
                 " ┡─╼ i _ LId=i-1\n"
                 " ╰─╼ proti AdpType=Prep|Case=Dat LId=proti-1\n"
                 "\n")
    # Build a small non-projective tree by hand: 1 under 3, 4 under 2.
    nonproj = Root()
    for form in ('1', '2', '3', '4'):
        nonproj.create_child(form=form)
    all_nodes = nonproj.descendants(add_self=1)
    all_nodes[1].parent = all_nodes[3]
    all_nodes[4].parent = all_nodes[2]
    expected3 = ("─┮\n"
                 " │ ╭─╼ 1\n"
                 " ┡─╪───┮ 2\n"
                 " ╰─┶ 3 │\n"
                 " ╰─╼ 4\n"
                 "\n")
    buf = io.StringIO()
    try:
        # Capture everything print_subtree writes to stdout.
        sys.stdout = buf
        tree.print_subtree(color=False)
        self.assertEqual(buf.getvalue(), expected1)
        buf.seek(0)
        buf.truncate()
        tree.print_subtree(color=False, attributes='form,feats,misc',
                           print_sent_id=False, print_text=False)
        self.assertEqual(buf.getvalue(), expected2)
        buf.seek(0)
        buf.truncate()
        nonproj.print_subtree(color=False, attributes='form', print_sent_id=0, print_text=0)
        self.assertEqual(buf.getvalue(), expected3)
    finally:
        sys.stdout = sys.__stdout__  # pylint: disable=redefined-variable-type
def tokenize_tag_parse_tree(self, root, resegment=False, tag=True, parse=True):
    """Tokenize, tag (+lemmatize, fill FEATS) and parse the text stored in `root.text`.

    If resegment=True, the returned list of Udapi trees may contain multiple trees.

    Raises ValueError if `root` already has children, or if parse=True with tag=False
    (the UDPipe parser needs the tagger's output).
    """
    if root.children:
        raise ValueError(
            'Tree already contained nodes before tokenization')
    # Fail fast on the invalid flag combination, before doing any tokenization work.
    if parse and not tag:
        raise ValueError(
            'Combination parse=True tag=False is not allowed.')

    # Tokenize and segment the text (segmentation cannot be turned off in older UDPipe versions).
    self.tokenizer.setText(root.text)
    u_sentences = []
    u_sentence = Sentence()
    while self.tokenizer.nextSentence(u_sentence):
        u_sentences.append(u_sentence)
        u_sentence = Sentence()

    # If resegmentation was not required, we need to join the segments,
    # renumbering the word ids so they stay consecutive within the merged sentence.
    if not resegment and len(u_sentences) > 1:
        first_sent = u_sentences[0]
        n_words = first_sent.words.size() - 1
        for other_sent in u_sentences[1:]:
            for i in range(1, other_sent.words.size()):
                u_w = other_sent.words[i]
                n_words += 1
                u_w.id = n_words
                first_sent.words.append(u_w)
        u_sentences = [first_sent]

    # Tagging and (optionally) parsing, sentence by sentence.
    if tag:
        for u_sentence in u_sentences:
            self.tool.tag(u_sentence, Model.DEFAULT)
            if parse:
                self.tool.parse(u_sentence, Model.DEFAULT)

    # Convert UDPipe nodes to Udapi nodes. The first sentence reuses `root`;
    # any further sentences (resegment=True) get a fresh Root each.
    new_root = root
    trees = []
    for u_sentence in u_sentences:
        if not new_root:
            new_root = Root()
        heads, nodes = [], [new_root]
        u_words = u_sentence.words
        for i in range(1, u_words.size()):
            u_w = u_words[i]
            node = new_root.create_child(
                form=u_w.form, lemma=u_w.lemma, upos=u_w.upostag,
                xpos=u_w.xpostag, feats=u_w.feats, deprel=u_w.deprel, misc=u_w.misc,
            )
            if parse:
                heads.append(u_w.head)
            nodes.append(node)
        if parse:
            # zip instead of heads.pop(0): pop(0) is O(n) per call,
            # making the original loop quadratic in sentence length.
            for node, head in zip(nodes[1:], heads):
                node.parent = nodes[head]
        trees.append(new_root)
        new_root = None
    return trees
def process_document(self, document):
    """Align predicted trees with the gold sentence segmentation of `document`.

    For each bundle, one predicted tree is taken from extract_pred_trees() and
    added to the bundle. When predicted segmentation differs from gold, nodes
    are moved between predicted trees (steal_nodes) and a token form may even
    be split, so that each bundle's predicted tree covers exactly the same
    characters (after space stripping) as its gold tree.
    """
    if not document.bundles:
        return
    # NOTE(review): pred_trees is consumed with pop() from the END of the list;
    # presumably extract_pred_trees returns the trees in reverse order -- confirm.
    pred_trees = self.extract_pred_trees(document)
    # Remember which nodes were children of a root in the original predicted trees.
    was_subroot = set()
    for pred_tree in pred_trees:
        for n in pred_tree.children:
            was_subroot.add(n)
    for bundle_no, bundle in enumerate(document.bundles):
        g_tree = bundle.trees[0]
        p_tree = pred_trees.pop()
        # Sentences are compared via their space-stripped character strings.
        g_chars = self._strip_spaces(''.join(t.form for t in g_tree.token_descendants))
        p_chars = self._strip_spaces(''.join(t.form for t in p_tree.token_descendants))
        # Additionally drop all Unicode space separators (category Zs).
        # NOTE(review): this Zs filtering is applied only HERE; characters appended
        # later (the while loop and the token loop below) go through _strip_spaces
        # only. Confirm _strip_spaces already removes all Zs characters, otherwise
        # the startswith/length comparisons below can misalign.
        g_chars = ''.join(filter(lambda c: unicodedata.category(c) != "Zs", g_chars))
        p_chars = ''.join(filter(lambda c: unicodedata.category(c) != "Zs", p_chars))
        if g_chars == p_chars:
            bundle.add_tree(p_tree)
            continue
        # Make sure that p_tree contains enough nodes.
        # NOTE(review): moved_roots is collected but never read in this method -- confirm
        # whether it is dead code or used via some side effect elsewhere.
        moved_roots = []
        while len(p_chars) < len(g_chars):
            if not pred_trees:
                raise ValueError('no pred_trees:\n%s\n%s' % (p_chars, g_chars))
            new_p_tree = pred_trees.pop()
            p_chars += self._strip_spaces(''.join(t.form for t in new_p_tree.token_descendants))
            moved_roots.extend(new_p_tree.children)
            p_tree.steal_nodes(new_p_tree.descendants)
        self.choose_root(p_tree, was_subroot, g_tree)
        if not p_chars.startswith(g_chars):
            raise ValueError('sent_id=%s: !p_chars.startswith(g_chars):\np_chars=%s\ng_chars=%s'
                             % (g_tree.sent_id, p_chars, g_chars))
        if g_chars == p_chars:
            bundle.add_tree(p_tree)
            continue
        # Now p_tree contains more nodes than it should.
        # Walk the tokens and find where the gold sentence boundary falls.
        p_chars = ''
        tokens = p_tree.token_descendants
        for index, token in enumerate(tokens):
            p_chars += self._strip_spaces(token.form)
            if len(p_chars) > len(g_chars):
                # A single predicted token spans the gold sentence boundary.
                logging.warning('Pred token crossing gold sentences: %s', g_tree.sent_id)
                # E.g. gold cs ln95048-151-p2s8 contains SpaceAfter=No on the last word
                # of the sentence, resulting in "uklidnila.Komentář" in the raw text.
                # It is not obvious how to fix this "properly", i.e. without increasing
                # or decreasing the resulting LAS. The current solution is quite hacky.
                if index + 1 == len(tokens):
                    # The crossing token is the last one: push the overflow characters
                    # into a fresh one-node tree for the next bundle.
                    next_p_tree = Root(zone=p_tree.zone)
                    pred_trees.append(next_p_tree)
                    next_p_tree.create_child(deprel='wrong', form=p_chars[len(g_chars):],
                                             misc='Rehanged=Yes')
                    bundle.add_tree(p_tree)
                    break
                else:
                    # Prepend the overflow characters to the following token,
                    # so the boundary now falls exactly after the current token.
                    next_tok = tokens[index + 1]
                    next_tok.form = p_chars[len(g_chars):] + next_tok.form
                    p_chars = g_chars
            if len(p_chars) == len(g_chars):
                # Boundary reached: move all remaining tokens into a new tree
                # that will serve as the prediction for the NEXT bundle.
                next_p_tree = Root(zone=p_tree.zone)
                words = []
                for token in tokens[index + 1:]:
                    if isinstance(token, MWT):
                        words.extend(token.words)
                    else:
                        words.append(token)
                for word in words:
                    if word in was_subroot:
                        # NOTE(review): presumably choose_root marked former subroots
                        # with Rehanged=Yes earlier; this undoes that mark -- confirm.
                        del word.misc['Rehanged']
                    # Mark words whose functional relations get severed by the split.
                    if word.parent is not p_tree and word.parent not in words:
                        if word.udeprel in FUNCTIONAL:
                            word.parent.misc['FuncChildMissing'] = 'Yes'
                    for child in word.children:
                        if child not in words and child.udeprel in FUNCTIONAL:
                            word.misc['FuncChildMissing'] = 'Yes'
                next_p_tree.steal_nodes(words)
                self.choose_root(p_tree, was_subroot, g_tree)
                self.choose_root(next_p_tree, was_subroot,
                                 document.bundles[bundle_no + 1].trees[0])
                pred_trees.append(next_p_tree)
                bundle.add_tree(p_tree)
                break