def __normalize_document(self, document: str) -> List[str]:
    tokens = []
    tokenizer = self.__model.newTokenizer(self.__model.DEFAULT)
    tokenizer.setText(document)
    error = udpipe.ProcessingError()
    sentence = udpipe.Sentence()
    while tokenizer.nextSentence(sentence, error):
        self.__model.tag(sentence, self.__model.DEFAULT)
        # 1: is used because words[0] is the root required by the dependency trees
        tokens.extend([w.lemma for w in sentence.words[1:]])
        sentence = udpipe.Sentence()
    return tokens
def __normalize_document(self, document: str) -> List[str]:
    tokens = []
    tokenizer = self.__model.newTokenizer(self.__model.DEFAULT)
    tokenizer.setText(document)
    error = udpipe.ProcessingError()
    sentence = udpipe.Sentence()
    while tokenizer.nextSentence(sentence, error):
        self.__model.tag(sentence, self.__model.DEFAULT)
        output = self.__output_format.writeSentence(sentence)
        sentence = udpipe.Sentence()
        tokens.extend([t['properties']['lemma'] for t in json.loads(output)['nodes']])
    return tokens
def _read(self, text, input_format):
    input_format.setText(text)
    error = udpipe.ProcessingError()
    sentences = []
    sentence = udpipe.Sentence()
    while input_format.nextSentence(sentence, error):
        sentences.append(sentence)
        sentence = udpipe.Sentence()
    if error.occurred():
        raise Exception(error.message)
    return sentences
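The _read helper above works with any configured udpipe.InputFormat. A minimal sketch of how its input_format argument might be constructed with the ufal.udpipe bindings follows; the format name shown is a standard one, but how the surrounding class actually wires this up is an assumption.

from ufal import udpipe

# Reader for text that is already annotated in CoNLL-U (no model required).
conllu_reader = udpipe.InputFormat.newInputFormat('conllu')

# For raw text, the tokenizer of a loaded udpipe.Model is used instead,
# as in the other snippets here (assuming a model variable holding Model.load(...)):
#     tokenizer = model.newTokenizer(model.DEFAULT)
# Either object can then be passed to _read as the input_format argument.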
def normalize(self, token):
    self.load_model()
    sentence = udpipe.Sentence()
    sentence.addWord(token)
    self.model.tag(sentence, self.model.DEFAULT)
    output = self.output_format.writeSentence(sentence)
    return json.loads(output)['nodes'][0]['properties']['lemma']
def segment(self, text):
    sent = ud.Sentence()
    # The 'ranges' option makes the tokenizer record character offsets,
    # which are read below via getTokenRangeStart/getTokenRangeEnd.
    tokenizer = self.model.newTokenizer('ranges')
    tokenizer.setText(text)
    sentences = []
    tokens = []
    raw_tokens = []
    sent_start = 0
    while tokenizer.nextSentence(sent):
        words = sent.words[1:]
        sent_raw_tokens = [(word.getTokenRangeStart(), word.getTokenRangeEnd())
                           for word in words]
        sentences.append(Sentence(sent_start, sent_start + len(sent_raw_tokens)))
        tokens += [text[raw_token[0]:raw_token[1]] for raw_token in sent_raw_tokens]
        raw_tokens += sent_raw_tokens
        sent_start += len(sent_raw_tokens)
    return tokens, sentences, raw_tokens
def convert_to_ud(sentence: NavigableSentence) -> ud.Sentence:
    result = ud.Sentence()
    for token in sentence:
        word = result.addWord(token.text)  # type: ud.Word
        if token.lemma:
            word.lemma = token.lemma
        if token.pos:
            if token.pos.upos:
                word.upostag = token.pos.upos
            if token.pos.xpos:
                word.xpostag = token.pos.xpos
            if token.pos.feats:
                word.feats = "|".join(
                    sorted("{}={}".format(*item) for item in token.pos.feats.items()))
    return result
def __normalize_token(self, token: str) -> str:
    sentence = udpipe.Sentence()
    sentence.addWord(token)
    self.__model.tag(sentence, self.__model.DEFAULT)
    output = self.__output_format.writeSentence(sentence)
    return json.loads(output)['nodes'][0]['properties']['lemma']
def udpipe_lemmatizer(token):
    sentence = udpipe.Sentence()
    sentence.addWord(token)
    model.tag(sentence, model.DEFAULT)
    output = output_format.writeSentence(sentence)
    return json.loads(output)['nodes'][0]['properties']['lemma']
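udpipe_lemmatizer above relies on module-level model and output_format objects that are not shown. The sketch below is one plausible setup, not the original author's: the model filename is a placeholder, and choosing the 'epe' output format is an inference from the JSON shape (a 'nodes' list whose entries carry a 'properties' dict with the lemma) that the function parses.

import json

from ufal import udpipe

# Placeholder model file; substitute the UDPipe model for your language.
model = udpipe.Model.load('english-ewt-ud-2.5.udpipe')
if model is None:
    raise RuntimeError('Could not load the UDPipe model')

# The 'epe' writer emits JSON per sentence, matching the 'nodes'/'properties'
# structure that udpipe_lemmatizer reads (an assumption, see above).
output_format = udpipe.OutputFormat.newOutputFormat('epe')

print(udpipe_lemmatizer('cities'))  # prints the model's lemma for the token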
def __normalize_token(self, token: str) -> str:
    sentence = udpipe.Sentence()
    sentence.addWord(token)
    self.__model.tag(sentence, self.__model.DEFAULT)
    # words[0] is the artificial root, so the single token's lemma is at index 1
    return sentence.words[1].lemma