Example #1
def __normalize_document(self, document: str) -> List[str]:
    tokens = []
    tokenizer = self.__model.newTokenizer(self.__model.DEFAULT)
    tokenizer.setText(document)
    error = udpipe.ProcessingError()
    sentence = udpipe.Sentence()
    while tokenizer.nextSentence(sentence, error):
        self.__model.tag(sentence, self.__model.DEFAULT)
        # Skip words[0]: it is the artificial root node required by the
        # dependency trees, not a token of the document.
        tokens.extend([w.lemma for w in sentence.words[1:]])
        sentence = udpipe.Sentence()
    return tokens
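
A minimal sketch of the setup this method relies on; the class name, model path, and constructor below are assumptions, not shown in the original snippet:

from typing import List

from ufal import udpipe

class DocumentNormalizer:
    def __init__(self, model_path: str):
        # Model.load returns None when the file cannot be read.
        self.__model = udpipe.Model.load(model_path)
        if self.__model is None:
            raise IOError("cannot load UDPipe model from " + model_path)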
Example #2
def __normalize_document(self, document: str) -> List[str]:
    tokens = []
    tokenizer = self.__model.newTokenizer(self.__model.DEFAULT)
    tokenizer.setText(document)
    error = udpipe.ProcessingError()
    sentence = udpipe.Sentence()
    while tokenizer.nextSentence(sentence, error):
        self.__model.tag(sentence, self.__model.DEFAULT)
        output = self.__output_format.writeSentence(sentence)
        sentence = udpipe.Sentence()
        # Each serialized node carries its lemma under 'properties'.
        tokens.extend([t['properties']['lemma']
                       for t in json.loads(output)['nodes']])
    return tokens
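
The 'nodes'/'properties' structure of the parsed output matches UDPipe's "epe" JSON writer, which serializes one JSON object per sentence. A sketch of how self.__output_format could have been created; the original does not show this setup:

import json

from ufal import udpipe

# The "epe" writer emits each sentence as JSON; token attributes such
# as the lemma live under node['properties'].
output_format = udpipe.OutputFormat.newOutputFormat("epe")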
Example #3
def _read(self, text, input_format):
    input_format.setText(text)
    error = udpipe.ProcessingError()
    sentences = []

    sentence = udpipe.Sentence()
    while input_format.nextSentence(sentence, error):
        sentences.append(sentence)
        sentence = udpipe.Sentence()

    if error.occurred():
        raise Exception(error.message)

    return sentences
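
A possible call site for _read; InputFormat.newInputFormat accepts the same format names as the udpipe CLI (e.g. "conllu", "horizontal"). The parser instance and file name are hypothetical:

from ufal import udpipe

# Read a pre-annotated CoNLL-U file into udpipe Sentence objects.
conllu_reader = udpipe.InputFormat.newInputFormat("conllu")
with open("corpus.conllu", encoding="utf-8") as f:
    sentences = parser._read(f.read(), conllu_reader)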
Example #4
def normalize(self, token):
    self.load_model()
    sentence = udpipe.Sentence()
    sentence.addWord(token)
    self.model.tag(sentence, self.model.DEFAULT)
    output = self.output_format.writeSentence(sentence)
    return json.loads(output)['nodes'][0]['properties']['lemma']
Example #5
def segment(self, text):
    sent = ud.Sentence()
    tokenizer = self.model.newTokenizer('ranges')
    tokenizer.setText(text)

    sentences = []
    tokens = []
    raw_tokens = []
    sent_start = 0

    while tokenizer.nextSentence(sent):
        words = sent.words[1:]
        # The 'ranges' tokenizer option records each word's character
        # offsets in the original text.
        sent_raw_tokens = [(word.getTokenRangeStart(),
                            word.getTokenRangeEnd()) for word in words]

        # Sentence here is the enclosing project's own span type
        # (start/end token indices), not ud.Sentence.
        sentences.append(
            Sentence(sent_start, sent_start + len(sent_raw_tokens)))
        tokens += [
            text[raw_token[0]:raw_token[1]]
            for raw_token in sent_raw_tokens
        ]

        raw_tokens += sent_raw_tokens
        sent_start += len(sent_raw_tokens)

    return tokens, sentences, raw_tokens
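
Hypothetical usage of segment, illustrating the three return values; the segmenter construction and the exact tokenization depend on the loaded model:

tokens, sentences, raw_tokens = segmenter.segment("Hello world. Bye.")
# tokens     -> surface strings, e.g. ['Hello', 'world', '.', 'Bye', '.']
# raw_tokens -> (start, end) character offsets into the input text
# sentences  -> per-sentence spans over token indices, e.g. (0, 3), (3, 5)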
Example #6
def convert_to_ud(sentence: NavigableSentence) -> ud.Sentence:
    result = ud.Sentence()
    for token in sentence:
        word = result.addWord(token.text)  # type: ud.Word
        if token.lemma:
            word.lemma = token.lemma
        if token.pos:
            if token.pos.upos:
                word.upostag = token.pos.upos
            if token.pos.xpos:
                word.xpostag = token.pos.xpos
            if token.pos.feats:
                # CoNLL-U renders morphological features as sorted
                # "key=value" pairs joined by '|'.
                word.feats = "|".join(
                    sorted("{}={}".format(*item)
                           for item in token.pos.feats.items()))
    return result
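
A short round-trip check for convert_to_ud: the result can be serialized with one of UDPipe's writers. The input sentence is assumed to be a NavigableSentence as in the signature above:

# Serialize the converted sentence in CoNLL-U for inspection.
writer = ud.OutputFormat.newOutputFormat("conllu")
print(writer.writeSentence(convert_to_ud(sentence)))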
Example #7
def __normalize_token(self, token: str) -> str:
    sentence = udpipe.Sentence()
    sentence.addWord(token)
    self.__model.tag(sentence, self.__model.DEFAULT)
    output = self.__output_format.writeSentence(sentence)
    return json.loads(output)['nodes'][0]['properties']['lemma']
Example #8
def udpipe_lemmatizer(token):
    # model and output_format are assumed to be module-level globals.
    sentence = udpipe.Sentence()
    sentence.addWord(token)
    model.tag(sentence, model.DEFAULT)
    output = output_format.writeSentence(sentence)
    return json.loads(output)['nodes'][0]['properties']['lemma']
Example #9
def __normalize_token(self, token: str) -> str:
    sentence = udpipe.Sentence()
    sentence.addWord(token)
    self.__model.tag(sentence, self.__model.DEFAULT)
    # words[0] is the artificial root, so the single added word is at index 1.
    return sentence.words[1].lemma
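
Two idioms recur across these examples: #4, #7, and #8 serialize the tagged sentence and read the lemma back out of the JSON output, while #9 takes it directly from sentence.words[1] (index 0 being the artificial root). The direct read skips the serializer round-trip; the JSON route is convenient when an output writer is already configured.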