コード例 #1
0
 def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
     """
     Apply the parent preprocessing, then store documents and tokens.

     Progress is reported through ``callback``. The document step receives
     the callback wrapped with ``end=0.5`` and the token step receives it
     wrapped with ``start=0.5``. The token step runs only when the corpus
     reports having tokens.

     :param corpus: Corpus
     :param callback: progress callback function; ``dummy_callback`` is
         substituted when it is None
     :return: Corpus
         Corpus returned by the last step that ran.
     """
     corpus = super().__call__(corpus)
     report = dummy_callback if callback is None else callback
     report(0, "Transforming...")

     documents_progress = wrap_callback(report, end=0.5)
     corpus = self._store_documents(corpus, documents_progress)

     # Nothing to transform on the token level: return the corpus as is.
     if not corpus.has_tokens():
         return corpus
     return self._store_tokens(corpus, wrap_callback(report, start=0.5))
コード例 #2
0
    def __call__(self,
                 corpus: Corpus,
                 callback: Callable = None,
                 **kw) -> Corpus:
        """
        Compute POS tags for the corpus tokens and store them on the corpus.

        The parent ``__call__`` runs first with the callback wrapped with
        ``end=0.2``. The tags come from ``self._preprocess`` applied to
        ``corpus.tokens`` and are saved as an object-dtype array in
        ``corpus.pos_tags``.

        :param corpus: Corpus
        :param callback: progress callback function; ``dummy_callback`` is
            substituted when it is None
        :param kw: extra keyword arguments forwarded to ``self._preprocess``
        :return: Corpus
            Corpus with ``pos_tags`` set.
        """
        progress = dummy_callback if callback is None else callback
        corpus = super().__call__(corpus, wrap_callback(progress, end=0.2))

        # Tagging works on tokens, so the corpus must have them at this point.
        assert corpus.has_tokens()
        progress(0.2, "POS Tagging...")
        corpus.pos_tags = np.array(
            self._preprocess(corpus.tokens, **kw), dtype=object
        )
        return corpus
コード例 #3
0
ファイル: preprocess.py プロジェクト: szzyiit/orange3-text
    def _store_tokens(self, corpus: Corpus, callback: Callable) -> Corpus:
        """
        Run ``self._preprocess`` on every token and store the result.

        The callback is invoked once per document, before that document is
        processed, with the fraction of documents handled so far.

        :param corpus: Corpus
        :param callback: progress callback function
        :return: Corpus
            The same corpus after ``store_tokens`` was called on it.
        """
        assert callback is not None
        assert corpus.has_tokens()

        processed = []
        total = len(corpus.tokens)
        for index, document in enumerate(corpus.tokens):
            callback(index / total)
            processed.append(list(map(self._preprocess, document)))

        corpus.store_tokens(processed)
        return corpus
コード例 #4
0
ファイル: preprocess.py プロジェクト: szzyiit/orange3-text
 def __call__(self, corpus: Corpus, callback: Callable) -> Corpus:
     """
     Run the parent preprocessing and make sure the corpus has tokens.

     When the corpus returned by the parent call has no tokens, it is
     passed through ``BASE_TOKENIZER`` together with the callback.

     :param corpus: Corpus
     :param callback: progress callback function, forwarded to
         ``BASE_TOKENIZER`` when tokenization is needed
     :return: Corpus
     """
     prepared = super().__call__(corpus)
     if prepared.has_tokens():
         return prepared

     # Imported here rather than at module level, as in the original code.
     from orangecontrib.text.preprocess import BASE_TOKENIZER
     return BASE_TOKENIZER(prepared, callback)
コード例 #5
0
ファイル: preprocess.py プロジェクト: szzyiit/orange3-text
 def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
     """
     Run the parent preprocessing and record the n-gram range on the corpus.

     :param corpus: Corpus
     :param callback: progress callback function, forwarded unchanged to
         the parent ``__call__``
     :return: Corpus
         Corpus with ``ngram_range`` set to this instance's range.
     """
     result = super().__call__(corpus, callback)
     # The range is only meaningful for a corpus that has tokens.
     assert result.has_tokens()
     result.ngram_range = self.__range
     return result