def segment_text(self, text):
    """Segment the provided text into sentences."""
    self.tokenizer.setText(text)
    is_another = True
    sentences = []
    while is_another:
        u_sentence = Sentence()
        is_another = self.tokenizer.nextSentence(u_sentence)
        if is_another:
            sentences.append(u_sentence.getText())
    return sentences
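
# A minimal usage sketch, assuming segment_text() is a method of a small
# wrapper class whose self.tokenizer is built from a loaded ufal.udpipe model.
# The class name, model path and sample text are illustrative placeholders,
# not part of the original code.
from ufal.udpipe import Model, Sentence  # Sentence is needed by segment_text()


class Segmenter:
    def __init__(self, model_path):
        self.model = Model.load(model_path)
        if self.model is None:
            raise IOError("Cannot load UDPipe model from: " + model_path)
        self.tokenizer = self.model.newTokenizer(self.model.DEFAULT)

    # segment_text() as defined above would be attached to this class.


# seg = Segmenter("english-ewt.udpipe")                   # placeholder path
# print(seg.segment_text("Hello world. This is a test."))
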

def _runApp(self, dto, opNotDone):
    """Tokenize, tag and parse the DTO text with UDPipe, then store the
    resulting sentences and token annotations on the Teprolin DTO."""
    text = dto.getText()
    tokenizer = self._model.newTokenizer(self._model.DEFAULT)
    tokenizer.setText(text)

    error = ProcessingError()
    sentence = Sentence()
    sid = 0

    while tokenizer.nextSentence(sentence, error):
        self._model.tag(sentence, self._model.DEFAULT)
        self._model.parse(sentence, self._model.DEFAULT)

        # Teprolin tokenized sentence
        ttsent = []
        # Teprolin string sentence
        tssent = sentence.getText()

        for w in sentence.words:
            if w.id == 0:
                continue

            tt = TeproTok()
            tt.setId(w.id)
            tt.setWordForm(w.form)
            tt.setCTAG(w.upostag)
            tt.setMSD(w.xpostag)
            tt.setLemma(w.lemma)
            tt.setHead(w.head)
            tt.setDepRel(w.deprel)

            ttsent.append(tt)
        # end for w

        if not dto.isOpPerformed(TeproAlgo.getSentenceSplittingOperName()):
            dto.addSentenceString(tssent)
            dto.addSentenceTokens(ttsent)
        else:
            # Check and update annotations that only TTL
            # can produce or that are requested specifically from it.
            alignment = dto.alignSentences(ttsent, sid)

            for op in opNotDone:
                dto.copyTokenAnnotation(ttsent, sid, alignment, op)

        sentence = Sentence()
        sid += 1
    # end all split sentences.

    return dto
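
# The plain ufal.udpipe calls that _runApp() builds on, shown without the
# Teprolin-specific DTO/TeproTok machinery. A sketch under the assumption
# that self._model is a Model obtained via Model.load(); the model path and
# sample text are placeholders.
from ufal.udpipe import Model, ProcessingError, Sentence

model = Model.load("romanian-rrt.udpipe")  # placeholder model path
tokenizer = model.newTokenizer(Model.DEFAULT)
tokenizer.setText("Acesta este un test. Aceasta este a doua propoziție.")

error = ProcessingError()
sentence = Sentence()
while tokenizer.nextSentence(sentence, error):
    model.tag(sentence, Model.DEFAULT)
    model.parse(sentence, Model.DEFAULT)
    for w in sentence.words:
        if w.id == 0:      # skip the artificial root word at index 0
            continue
        print(w.id, w.form, w.lemma, w.upostag, w.xpostag, w.head, w.deprel)
    sentence = Sentence()  # fresh Sentence for the next nextSentence() call
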

def tokenize_tag_parse_tree(self, root, resegment=False, tag=True, parse=True):
    """Tokenize, tag (+lemmatize, fill FEATS) and parse the text stored in `root.text`.

    If resegment=True, the returned list of Udapi trees may contain multiple trees.
    """
    if root.children:
        raise ValueError('Tree already contained nodes before tokenization')

    # Tokenize and segment the text (segmentation cannot be turned off in older UDPipe versions).
    self.tokenizer.setText(root.text)
    is_another = True
    u_sentences = []
    while is_another:
        u_sentence = Sentence()
        is_another = self.tokenizer.nextSentence(u_sentence)
        if is_another:
            u_sentences.append(u_sentence)

    # If resegmentation was not required, we need to join the segments.
    if not resegment and len(u_sentences) > 1:
        first_sent = u_sentences[0]
        n_words = first_sent.words.size() - 1
        for other_sent in u_sentences[1:]:
            other_words = other_sent.words.size() - 1
            for i in range(1, other_words + 1):
                u_w = other_sent.words[i]
                n_words += 1
                u_w.id = n_words
                first_sent.words.append(u_w)
        u_sentences = [first_sent]

    # tagging and parsing
    if tag:
        for u_sentence in u_sentences:
            self.tool.tag(u_sentence, Model.DEFAULT)
            if parse:
                self.tool.parse(u_sentence, Model.DEFAULT)
    elif parse:
        raise ValueError('Combination parse=True tag=False is not allowed.')

    # converting UDPipe nodes to Udapi nodes
    new_root = root
    trees = []
    for u_sentence in u_sentences:
        if not new_root:
            new_root = Root()
        new_root.text = u_sentence.getText() if resegment else root.text
        heads, nodes = [], [new_root]
        u_words = u_sentence.words
        for i in range(1, u_words.size()):
            u_w = u_words[i]
            node = new_root.create_child(
                form=u_w.form, lemma=u_w.lemma, upos=u_w.upostag,
                xpos=u_w.xpostag, feats=u_w.feats, deprel=u_w.deprel,
                misc=u_w.misc,
            )
            if parse:
                heads.append(u_w.head)
                nodes.append(node)
        if parse:
            for node in nodes[1:]:
                head = heads.pop(0)
                node.parent = nodes[head]
        trees.append(new_root)
        new_root = None
    return trees
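
# A usage sketch for tokenize_tag_parse_tree() through Udapi's UDPipe tool
# wrapper (udapi.tool.udpipe.UDPipe). The constructor argument, model path
# and sample text are assumptions for illustration only.
from udapi.core.root import Root
from udapi.tool.udpipe import UDPipe

tool = UDPipe(model="english-ewt.udpipe")  # assumed: a model path is accepted here

root = Root()
root.text = "UDPipe tokenizes this text. Then it tags and parses it."

trees = tool.tokenize_tag_parse_tree(root, resegment=True)
for tree in trees:
    print("#", tree.text)
    for node in tree.descendants:
        print(node.ord, node.form, node.upos, node.parent.ord, node.deprel)
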