def read_tree(self, document=None):
    """Read the next line from `self.filehandle` and wrap it in a new tree root.

    The `document` argument is accepted but not used by this method.

    Returns:
        A freshly created `Root` whose `text` is the line with trailing
        whitespace removed, or None when there is no filehandle or the
        end of the input has been reached.
    """
    handle = self.filehandle
    if handle is None:
        return None

    raw = handle.readline()
    # readline() yields '' only at end of input; a blank line still has its newline.
    if raw == '':
        return None

    tree = Root()
    tree.text = raw.rstrip()
    return tree
def read_tree(self, document=None):
    """Read the next line from `self.filehandle` and wrap it in a new tree root.

    When `self.ignore_empty_lines` is true, lines consisting solely of a
    newline ('\\n' or '\\r\\n') are skipped. The `document` argument is
    accepted but not used by this method.

    Returns:
        A freshly created `Root` whose `text` is the line after
        `str.rstrip(self.rstrip)`, or None when there is no filehandle or
        the end of the input has been reached.
    """
    handle = self.filehandle
    if handle is None:
        return None

    raw = handle.readline()
    # readline() yields '' only at end of input, whereas a blank line comes
    # back as '\n' (or '\r\n' when a Windows file is read on a Unix machine).
    if raw == '':
        return None

    skip_blank = self.ignore_empty_lines
    while skip_blank and raw in ('\n', '\r\n'):
        raw = handle.readline()
        if raw == '':
            return None

    tree = Root()
    # self.rstrip is handed to str.rstrip as the set of characters to strip.
    tree.text = raw.rstrip(self.rstrip)
    return tree
def read_tree(self, document=None):
    """Read the next line from `self.filehandle` and wrap it in a new tree root.

    When `self.ignore_empty_lines` is true, lines consisting solely of a
    newline ('\\n' or '\\r\\n') are skipped. The `document` argument is
    accepted but not used by this method.

    Returns:
        A freshly created `Root` whose `text` is the line with trailing
        whitespace removed, or None when there is no filehandle or the
        end of the input has been reached.
    """
    handle = self.filehandle
    if handle is None:
        return None

    raw = handle.readline()
    # readline() yields '' only at end of input, whereas a blank line comes
    # back as '\n' (or '\r\n' when a Windows file is read on a Unix machine).
    if raw == '':
        return None

    skip_blank = self.ignore_empty_lines
    while skip_blank and raw in ('\n', '\r\n'):
        raw = handle.readline()
        if raw == '':
            return None

    tree = Root()
    tree.text = raw.rstrip()
    return tree
def tokenize_tag_parse_tree(self, root, resegment=False, tag=True, parse=True):
    """Tokenize, tag (+lemmatize, fill FEATS) and parse the text stored in `root.text`.

    If resegment=True, the returned list of Udapi trees may contain multiple trees.

    Args:
        root: tree root without children; its `text` is the input to the tokenizer.
        resegment: keep the tokenizer's sentence segmentation (one tree per
            segment) instead of joining all segments into a single tree.
        tag: run `self.tool.tag` on every sentence.
        parse: run `self.tool.parse` on every sentence; requires `tag=True`.

    Returns:
        A list of trees, one per resulting sentence. The first tree is `root`
        itself; any further trees are newly created `Root` instances. The
        list is empty when the tokenizer produces no sentence.

    Raises:
        ValueError: if `root` already has children, or if `parse=True` is
            combined with `tag=False`.
    """
    if root.children:
        raise ValueError(
            'Tree already contained nodes before tokenization')
    # Fail fast on the unsupported option combination, before the tokenizer
    # is fed and any work is done.
    if parse and not tag:
        raise ValueError(
            'Combination parse=True tag=False is not allowed.')

    # Tokenize and segment the text (segmentation cannot be turned off in older UDPipe versions).
    self.tokenizer.setText(root.text)
    u_sentences = []
    while True:
        u_sentence = Sentence()
        if not self.tokenizer.nextSentence(u_sentence):
            break
        u_sentences.append(u_sentence)

    # If resegmentation was not required, we need to join the segments.
    if not resegment and len(u_sentences) > 1:
        first_sent = u_sentences[0]
        # Index 0 of `words` is skipped everywhere below, so it is not counted here.
        n_words = first_sent.words.size() - 1
        for other_sent in u_sentences[1:]:
            for i in range(1, other_sent.words.size()):
                u_w = other_sent.words[i]
                n_words += 1
                u_w.id = n_words  # renumber to continue after the first segment
                first_sent.words.append(u_w)
        u_sentences = [first_sent]

    # tagging and parsing
    if tag:
        for u_sentence in u_sentences:
            self.tool.tag(u_sentence, Model.DEFAULT)
            if parse:
                self.tool.parse(u_sentence, Model.DEFAULT)

    # converting UDPipe nodes to Udapi nodes
    trees = []
    new_root = root  # the first sentence re-uses the caller's root
    for u_sentence in u_sentences:
        # Compare against the None sentinel explicitly, so that an empty
        # (possibly falsy) root passed by the caller is never replaced.
        if new_root is None:
            new_root = Root()
        new_root.text = u_sentence.getText() if resegment else root.text
        heads, nodes = [], [new_root]
        u_words = u_sentence.words
        for i in range(1, u_words.size()):
            u_w = u_words[i]
            node = new_root.create_child(
                form=u_w.form,
                lemma=u_w.lemma,
                upos=u_w.upostag,
                xpos=u_w.xpostag,
                feats=u_w.feats,
                deprel=u_w.deprel,
                misc=u_w.misc,
            )
            if parse:
                heads.append(u_w.head)
            nodes.append(node)
        # heads[k] is the head index of nodes[k + 1]; index 0 denotes the root.
        # `heads` stays empty when parse is false, so nothing is re-attached.
        for node, head in zip(nodes[1:], heads):
            node.parent = nodes[head]
        trees.append(new_root)
        new_root = None
    return trees