Code Example #1
    def __call__(self,
                 corpus: Corpus,
                 callback: Callable = None,
                 **kw) -> Corpus:
        """ Marks tokens of a corpus with POS tags. """
        if callback is None:
            callback = dummy_callback
        corpus = super().__call__(corpus, wrap_callback(callback, end=0.2))

        assert corpus.has_tokens()
        callback(0.2, "POS Tagging...")
        tags = np.array(self._preprocess(corpus.tokens, **kw), dtype=object)
        corpus.pos_tags = tags
        return corpus
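A minimal usage sketch of the tagger above; the tagger class, dataset name, and import paths are assumptions for illustration and may differ across orange3-text versions:

# Hypothetical usage; class and dataset names are assumptions.
from orangecontrib.text import Corpus
from orangecontrib.text.tag import AveragedPerceptronTagger

corpus = Corpus.from_file("book-excerpts")
tagger = AveragedPerceptronTagger()
tagged = tagger(corpus)             # invokes the __call__ shown above
print(tagged.pos_tags[0][:10])      # tags aligned with tagged.tokens[0]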
Code Example #2
    def _store_tokens_from_documents(self, corpus: Corpus,
                                     callback: Callable) -> Corpus:
        """
        Create tokens from documents and set corpus.tokens.

        :param corpus: Corpus
        :param callback: progress callback function
        :return: Corpus
            Preprocessed corpus.
        """
        assert callback is not None
        tokens, n = [], len(corpus.pp_documents)
        for i, doc in enumerate(corpus.pp_documents):
            callback(i / n)
            tokens.append(self._preprocess(doc))
        corpus.pos_tags = None
        corpus.store_tokens(tokens)
        return corpus
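The method asserts that a callback is supplied and reports progress as a fraction per document. A minimal sketch of such a callback, with a hypothetical tokenizer name used purely for illustration:

# Hypothetical callback; the tokenizer class name is an assumption.
def print_progress(fraction, msg=""):
    print(f"{fraction:6.1%} {msg}")

tokenizer = WordPunctTokenizer()
corpus = tokenizer(corpus, callback=print_progress)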
Code Example #3
File: filter.py  Project: larazupan/orange3-text
    def _filter_tokens(self, corpus: Corpus, callback: Callable) -> Corpus:
        """Keep only tokens whose POS tag is in ``self._tags``."""
        if corpus.pos_tags is None:
            return corpus
        callback(0, "Filtering...")
        filtered_tags = []
        filtered_tokens = []
        for tags, tokens in zip(corpus.pos_tags, corpus.tokens):
            tmp_tags = []
            tmp_tokens = []
            for tag, token in zip(tags, tokens):
                # should we consider partial matches, i.e. "NN" for "NNS"?
                if tag in self._tags:
                    tmp_tags.append(tag)
                    tmp_tokens.append(token)
            filtered_tags.append(tmp_tags)
            filtered_tokens.append(tmp_tokens)
        corpus.store_tokens(filtered_tokens)
        corpus.pos_tags = filtered_tags
        return corpus
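A self-contained sketch of the same tag-keeping logic on plain lists (the data and the allowed tag set below are made up for illustration):

# Keep only tokens whose tag is in the allowed set, preserving alignment.
tags   = [["DT", "NN", "VBZ"], ["PRP", "VBD"]]
tokens = [["the", "cat", "sits"], ["she", "ran"]]
allowed = {"NN", "VBD"}

filtered = [
    [(tg, tk) for tg, tk in zip(ts, ws) if tg in allowed]
    for ts, ws in zip(tags, tokens)
]
# filtered == [[("NN", "cat")], [("VBD", "ran")]]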
Code Example #4
File: filter.py  Project: larazupan/orange3-text
    def _filter_tokens(self,
                       corpus: Corpus,
                       callback: Callable,
                       dictionary=None) -> Corpus:
        """Keep the tokens selected by ``self._preprocess`` and keep POS tags aligned."""
        callback(0, "Filtering...")
        filtered_tokens = []
        filtered_tags = []
        for i, tokens in enumerate(corpus.tokens):
            filter_map = self._preprocess(tokens)
            filtered_tokens.append(list(compress(tokens, filter_map)))
            if corpus.pos_tags is not None:
                filtered_tags.append(
                    list(compress(corpus.pos_tags[i], filter_map)))
        if dictionary is None:
            corpus.store_tokens(filtered_tokens)
        else:
            corpus.store_tokens(filtered_tokens, dictionary)
        if filtered_tags:
            corpus.pos_tags = np.array(filtered_tags, dtype=object)
        return corpus
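A self-contained sketch of the itertools.compress pattern used above; the token list and stopword set are made up for illustration:

from itertools import compress

tokens = ["the", "quick", "brown", "fox"]
stopwords = {"the", "a", "an"}
filter_map = [t not in stopwords for t in tokens]   # True = keep the token
kept = list(compress(tokens, filter_map))           # ["quick", "brown", "fox"]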