Code Example #1
File: normalize.py  Project: biolab/orange3-text
 def __normalize_document(self, document: str) -> List[str]:
     tokens = []
     tokenizer = self.__model.newTokenizer(self.__model.DEFAULT)
     tokenizer.setText(document)
     error = udpipe.ProcessingError()
     sentence = udpipe.Sentence()
     while tokenizer.nextSentence(sentence, error):
         self.__model.tag(sentence, self.__model.DEFAULT)
         # words[1:] is used because words[0] is the artificial root node required by the dependency trees
         tokens.extend([w.lemma for w in sentence.words[1:]])
         sentence = udpipe.Sentence()
     return tokens
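
For context, here is a standalone sketch of the same tokenize, tag, and lemmatize flow, including how the UDPipe model referenced above as self.__model might be loaded. The model file name and the helper function are illustrative and not part of the project code; only the ufal.udpipe calls already shown in the example are taken as given.

 from typing import List

 import ufal.udpipe as udpipe  # pip install ufal.udpipe


 def lemmatize(model_path: str, document: str) -> List[str]:
     model = udpipe.Model.load(model_path)          # returns None if the file cannot be read
     if model is None:
         raise IOError("Cannot load UDPipe model: " + model_path)
     tokenizer = model.newTokenizer(model.DEFAULT)  # the tokenizer is an InputFormat
     tokenizer.setText(document)
     error = udpipe.ProcessingError()
     sentence = udpipe.Sentence()
     lemmas = []
     while tokenizer.nextSentence(sentence, error):
         model.tag(sentence, model.DEFAULT)         # fills in lemma/UPOS for every word
         lemmas.extend(w.lemma for w in sentence.words[1:])  # skip the artificial root
         sentence = udpipe.Sentence()
     if error.occurred():
         raise RuntimeError(error.message)
     return lemmas

 # lemmas = lemmatize("english-ewt-ud-2.5.udpipe", "Dogs were barking.")  # hypothetical model file
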
Code Example #2
File: normalize.py  Project: scoobiii/orange3-text
 def __normalize_document(self, document: str) -> List[str]:
     tokens = []
     tokenizer = self.__model.newTokenizer(self.__model.DEFAULT)
     tokenizer.setText(document)
     error = udpipe.ProcessingError()
     sentence = udpipe.Sentence()
     while tokenizer.nextSentence(sentence, error):
         self.__model.tag(sentence, self.__model.DEFAULT)
         # serialize the tagged sentence via the configured output format (JSON)
         output = self.__output_format.writeSentence(sentence)
         sentence = udpipe.Sentence()
         tokens.extend([t['properties']['lemma']
                        for t in json.loads(output)['nodes']])
     return tokens
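
The difference from example #1 is that the lemmas are read back out of a serialized sentence rather than directly from sentence.words. The JSON structure with a "nodes" list and per-node "properties" suggests UDPipe's EPE output format, so the format name passed below is an assumption, as is the surrounding setup.

 import json

 import ufal.udpipe as udpipe

 # Assumed setup for self.__output_format (the "epe" name is inferred, not confirmed).
 output_format = udpipe.OutputFormat.newOutputFormat("epe")

 # After tagging a udpipe.Sentence named `sentence`:
 #   output = output_format.writeSentence(sentence)
 #   lemmas = [node["properties"]["lemma"] for node in json.loads(output)["nodes"]]
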
Code Example #3
    def _read(self, text, input_format):
        # collect every sentence the given udpipe InputFormat produces from `text`
        input_format.setText(text)
        error = udpipe.ProcessingError()
        sentences = []

        sentence = udpipe.Sentence()
        while input_format.nextSentence(sentence, error):
            sentences.append(sentence)
            sentence = udpipe.Sentence()

        if error.occurred():
            raise Exception(error.message)

        return sentences
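
One possible way to drive _read is to build a udpipe InputFormat for already-annotated text (or reuse a tokenizer obtained from model.newTokenizer) and pass it in together with the raw text. The format name and the reader variable below are illustrative, not taken from the project.

 import ufal.udpipe as udpipe

 conllu_reader = udpipe.InputFormat.newInputFormat("conllu")  # reads CoNLL-U formatted text
 # sentences = some_reader_object._read(conllu_text, conllu_reader)
 # Each returned item is a udpipe.Sentence whose .words carry form, lemma, UPOS, etc.
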
Code Example #4
 def parse_text(self, txt):
     err = udpipe.ProcessingError()
     tokenized = ""
     current_block = []
     for line in txt.split("\n"):
         if re.match(comment_regex, line.lstrip()):  # comment line
             # flush the accumulated text block through the UDPipe pipeline first
             if current_block:
                 tokenized += self.pipeline.process("\n".join(current_block), err)
                 current_block = []
             # pass the comment line through untouched
             tokenized += line.lstrip() + "\n"  # re.sub(comment_regex, "# ", line.lstrip()+"\n")
             continue
         # normal text line, save to current block to be tokenized
         current_block.append(line)
     if current_block:
         tokenized += self.pipeline.process("\n".join(current_block), err)
     return tokenized
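
The method above relies on two names defined elsewhere in the project: comment_regex and self.pipeline. A minimal sketch of how they might be set up is shown below; the regex pattern, model file name, and pipeline arguments are assumptions for illustration only.

 import re

 import ufal.udpipe as udpipe

 comment_regex = re.compile(r"^#")                    # hypothetical: lines starting with '#' are comments
 model = udpipe.Model.load("model.udpipe")            # illustrative model file name
 pipeline = udpipe.Pipeline(model, "tokenize",        # tokenize raw text
                            udpipe.Pipeline.DEFAULT,  # default tagger settings
                            udpipe.Pipeline.NONE,     # skip the dependency parser
                            "conllu")                 # emit CoNLL-U
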