def __normalize_document(self, document: str) -> List[str]:
    tokens = []
    tokenizer = self.__model.newTokenizer(self.__model.DEFAULT)
    tokenizer.setText(document)
    error = udpipe.ProcessingError()
    sentence = udpipe.Sentence()
    while tokenizer.nextSentence(sentence, error):
        self.__model.tag(sentence, self.__model.DEFAULT)
        # 1: is used because words[0] is the root required by the dependency trees
        tokens.extend([w.lemma for w in sentence.words[1:]])
        sentence = udpipe.Sentence()
    return tokens

def __normalize_document(self, document: str) -> List[str]:
    tokens = []
    tokenizer = self.__model.newTokenizer(self.__model.DEFAULT)
    tokenizer.setText(document)
    error = udpipe.ProcessingError()
    sentence = udpipe.Sentence()
    while tokenizer.nextSentence(sentence, error):
        self.__model.tag(sentence, self.__model.DEFAULT)
        # Serialize the tagged sentence and collect the lemma of every token node.
        output = self.__output_format.writeSentence(sentence)
        sentence = udpipe.Sentence()
        tokens.extend([t['properties']['lemma'] for t in json.loads(output)['nodes']])
    return tokens
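Both __normalize_document variants above rely on instance state that is not shown: self.__model must be a loaded UDPipe model, and the second variant also needs self.__output_format. The json.loads(output)['nodes'][...]['properties']['lemma'] access pattern matches UDPipe's "epe" JSON output, so a plausible setup is sketched below; the class name, the model_path parameter and the "epe" choice are assumptions, not taken from the snippets.

# Sketch only: class name, model_path and the "epe" output format are assumptions.
from ufal import udpipe


class Lemmatizer:
    def __init__(self, model_path: str):
        self.__model = udpipe.Model.load(model_path)
        if self.__model is None:
            raise ValueError("cannot load UDPipe model from " + model_path)
        # "epe" emits one JSON object per sentence with a "nodes" list whose entries
        # carry a "properties" dict (lemma, upos, ...), matching the parsing above.
        self.__output_format = udpipe.OutputFormat.newOutputFormat("epe")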
def _read(self, text, input_format):
    input_format.setText(text)
    error = udpipe.ProcessingError()
    sentences = []
    # Collect every sentence the reader produces from the text.
    sentence = udpipe.Sentence()
    while input_format.nextSentence(sentence, error):
        sentences.append(sentence)
        sentence = udpipe.Sentence()
    if error.occurred():
        raise Exception(error.message)
    return sentences
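_read expects a ready udpipe.InputFormat reader. A minimal sketch of how it might be driven, assuming the surrounding wrapper also holds a loaded model; the class and method names here are illustrative, not taken from the snippet.

from ufal import udpipe


class UDPipeReader:
    def __init__(self, model_path):
        self.model = udpipe.Model.load(model_path)

    # _read as defined above would sit here.

    def tokenize(self, text):
        # The tokenizer returned by the model is itself an InputFormat,
        # so it can be handed straight to _read.
        return self._read(text, self.model.newTokenizer(self.model.DEFAULT))

    def read(self, text, in_format):
        # in_format is a format name such as "conllu", "horizontal" or "vertical".
        return self._read(text, udpipe.InputFormat.newInputFormat(in_format))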
def parse_text(self, txt):
    err = udpipe.ProcessingError()
    tokenized = ""
    current_block = []
    for line in txt.split("\n"):
        if re.match(comment_regex, line.lstrip()):
            # comment line
            if current_block:
                tokenized += self.pipeline.process("\n".join(current_block), err)
                current_block = []
            tokenized += line.lstrip() + "\n"  # re.sub(comment_regex, "# ", line.lstrip() + "\n")
            continue
        # normal text line, save to current block to be tokenized
        current_block.append(line)
    if current_block:
        tokenized += self.pipeline.process("\n".join(current_block), err)
    return tokenized
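parse_text depends on self.pipeline and a module-level comment_regex, neither of which is shown. A plausible setup is sketched below, assuming raw-text tokenization and CoNLL-U output; the pipeline options, the regex and the class name are all assumptions.

import re

from ufal import udpipe

# Hypothetical pattern for the comment lines parse_text passes through untouched;
# the real comment_regex is not shown in the snippet above.
comment_regex = re.compile(r"^#")


class TextTokenizer:
    def __init__(self, model_path):
        model = udpipe.Model.load(model_path)
        if model is None:
            raise ValueError("cannot load UDPipe model from " + model_path)
        # Tokenize raw text, tag and parse with the model defaults, output CoNLL-U.
        self.pipeline = udpipe.Pipeline(model, "tokenize",
                                        udpipe.Pipeline.DEFAULT,
                                        udpipe.Pipeline.DEFAULT, "conllu")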