def _setup_corpus(self, text_features: List[Variable] = None) -> None:
    """
    Parameters
    ----------
    text_features
        meta attributes that are used for text mining. Infer them if None.
    """
    self.text_features = []  # list of text features for mining
    self._tokens = None
    self._dictionary = None
    self.ngram_range = (1, 1)
    self.attributes = {}
    self._pos_tags = None
    from orangecontrib.text.preprocess import PreprocessorList
    self.__used_preprocessor = PreprocessorList([])  # required for compute values
    self._titles: Optional[np.ndarray] = None
    self._pp_documents = None  # preprocessed documents

    if text_features is None:
        self._infer_text_features()
    else:
        self.set_text_features(text_features)

    self._set_unique_titles()
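# A minimal usage sketch of the two paths above, assuming orange3-text is
# installed; "book-excerpts" ships with the add-on and stores its text in a
# string meta named "Text" (an assumption about the column name):
from orangecontrib.text import Corpus

corpus = Corpus.from_file("book-excerpts")
# Loading takes the text_features=None path, so the text meta is inferred;
# an explicit choice goes through set_text_features():
corpus.set_text_features([corpus.domain["Text"]])
print(corpus.text_features)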
def used_preprocessor(self, pp):
    from orangecontrib.text.preprocess import PreprocessorList, Preprocessor
    if isinstance(pp, PreprocessorList):
        self.__used_preprocessor = PreprocessorList(list(pp.preprocessors))
    elif isinstance(pp, Preprocessor):
        self.__used_preprocessor.preprocessors.append(pp)
    else:
        raise NotImplementedError
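# Hedged sketch of the two setter branches above, assuming this is the
# Corpus.used_preprocessor property setter from orangecontrib.text: a
# PreprocessorList replaces the stored copy, a single Preprocessor appends.
from orangecontrib.text import Corpus
from orangecontrib.text.preprocess import LowercaseTransformer, PreprocessorList

corpus = Corpus.from_file("book-excerpts")
corpus.used_preprocessor = PreprocessorList([LowercaseTransformer()])  # replace
corpus.used_preprocessor = LowercaseTransformer()                      # append
print(len(corpus.used_preprocessor.preprocessors))  # -> 2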
def _base_tokens(self):
    from orangecontrib.text.preprocess import BASE_TRANSFORMER, \
        BASE_TOKENIZER, PreprocessorList

    # don't use anything that requires NLTK data, so the async download
    # does not have to be awaited
    base_preprocessors = PreprocessorList([BASE_TRANSFORMER, BASE_TOKENIZER])
    corpus = base_preprocessors(self)
    return corpus.tokens, corpus.dictionary
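# What _base_tokens computes, sketched from the outside (assumes
# BASE_TRANSFORMER and BASE_TOKENIZER are the NLTK-free defaults exported
# by orangecontrib.text.preprocess, as imported above):
from orangecontrib.text import Corpus
from orangecontrib.text.preprocess import (
    BASE_TOKENIZER, BASE_TRANSFORMER, PreprocessorList
)

corpus = Corpus.from_file("book-excerpts")
base = PreprocessorList([BASE_TRANSFORMER, BASE_TOKENIZER])(corpus)
print(base.tokens[0][:5])    # tokens of the first document
print(len(base.dictionary))  # dictionary built over all tokens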
def set_corpus(self, data=None):
    self.corpus = data
    # create preprocessed corpus upon setting data to avoid preprocessing
    # at each method run
    pp_list = [
        preprocess.LowercaseTransformer(),
        preprocess.WordPunctTokenizer()
    ]
    self.pp_corpus = PreprocessorList(pp_list)(self.corpus)
    self.commit()
def test_used_preprocessors(self):
    corpus1 = self.corpus.copy()
    for pp in self.pp_list:
        corpus1 = pp(corpus1)

    self.assertEqual(len(self.corpus.used_preprocessor.preprocessors), 0)
    self.assertEqual(len(corpus1.used_preprocessor.preprocessors), 5)
    self.assertEqual(
        [8, 10, 6, 8, 9, 7, 7, 10, 4], list(map(len, corpus1._tokens))
    )

    corpus2 = PreprocessorList(self.pp_list)(self.corpus)
    self.assertEqual(corpus1, corpus2)
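# The assertions above rely on a five-element self.pp_list fixture that is
# not shown here; a purely hypothetical composition consistent with that
# count (names from orangecontrib.text.preprocess) could be:
#     self.pp_list = [
#         preprocess.LowercaseTransformer(),
#         preprocess.WordPunctTokenizer(),
#         preprocess.SnowballStemmer(),
#         preprocess.StopwordsFilter(),
#         preprocess.FrequencyFilter(),
#     ]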
def __init__(self, domain=None, X=None, Y=None, metas=None, W=None,
             text_features=None, ids=None):
    """
    Args:
        domain (Orange.data.Domain): the domain for this Corpus
        X (numpy.ndarray): attributes
        Y (numpy.ndarray): class variables
        metas (numpy.ndarray): meta attributes; e.g. text
        W (numpy.ndarray): instance weights
        text_features (list): meta attributes that are used for
            text mining. Infer them if None.
        ids (numpy.ndarray): Indices
    """
    super().__init__()
    n_doc = _check_arrays(X, Y, metas)

    with self.unlocked_reference():
        self.X = X if X is not None else np.zeros((n_doc, 0))
        self.Y = Y if Y is not None else np.zeros((n_doc, 0))
        self.metas = metas if metas is not None else np.zeros((n_doc, 0))
        self.W = W if W is not None else np.zeros((n_doc, 0))
        self.domain = domain

    self.text_features = []  # list of text features for mining
    self._tokens = None
    self._dictionary = None
    self._ngrams_corpus = None
    self.ngram_range = (1, 1)
    self.attributes = {}
    self._pos_tags = None
    from orangecontrib.text.preprocess import PreprocessorList
    self.__used_preprocessor = PreprocessorList([])  # required for compute values
    self._titles: Optional[np.ndarray] = None
    self._pp_documents = None  # preprocessed documents

    if domain is not None and text_features is None:
        self._infer_text_features()
    elif domain is not None:
        self.set_text_features(text_features)

    if ids is not None:
        self.ids = ids
    else:
        Table._init_ids(self)

    self._set_unique_titles()
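# Hedged construction sketch for the __init__ above, assuming direct
# instantiation with the signature shown is supported (Corpus subclasses
# Orange.data.Table; the raw text lives in a StringVariable meta):
import numpy as np
from Orange.data import Domain, StringVariable
from orangecontrib.text import Corpus

text_var = StringVariable("text")
domain = Domain([], metas=[text_var])
metas = np.array([["first document"], ["second document"]], dtype=object)
corpus = Corpus(domain, metas=metas, text_features=[text_var])
print(len(corpus), corpus.text_features)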
def set_corpus(self, data=None):
    self.corpus = data
    self.pp_corpus = None
    if self.corpus is not None:
        if not self.corpus.has_tokens():
            # create preprocessed corpus upon setting data to avoid
            # preprocessing at each method run
            pp_list = [
                preprocess.LowercaseTransformer(),
                preprocess.WordPunctTokenizer()
            ]
            self.pp_corpus = PreprocessorList(pp_list)(self.corpus)
        else:
            self.pp_corpus = self.corpus
    self.commit.now()
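# Quick check of the has_tokens() guard used above (assumes orange3-text):
from orangecontrib.text import Corpus
from orangecontrib.text.preprocess import WordPunctTokenizer

corpus = Corpus.from_file("book-excerpts")
print(corpus.has_tokens())     # False: nothing has been tokenized yet
tokenized = WordPunctTokenizer()(corpus)
print(tokenized.has_tokens())  # True: preprocessing can be skipped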
def preprocess_only_words(corpus: Corpus) -> Corpus:
    """
    Apply the preprocessor that splits words, transforms them to lower case
    (and removes punctuation).

    Parameters
    ----------
    corpus
        Corpus on which the preprocessor will be applied.

    Returns
    -------
    Preprocessed corpus. The result of preprocessing is saved in
    tokens/ngrams.
    """
    p = PreprocessorList(
        [LowercaseTransformer(),
         # by default, the regexp keeps only words (no punctuation, no spaces)
         RegexpTokenizer()]
    )
    return p(corpus)
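# Example call of preprocess_only_words as defined above ("book-excerpts"
# ships with orange3-text; any corpus with a text meta would do):
from orangecontrib.text import Corpus

corpus = Corpus.from_file("book-excerpts")
processed = preprocess_only_words(corpus)
print(processed.tokens[0][:10])  # lower-cased, punctuation-free tokens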
def test_result(self):
    pp = PreprocessorList([BASE_TRANSFORMER, RegexpTokenizer()])
    corpus = pp(Corpus.from_file("book-excerpts")[::3])
    vect = BowVectorizer()
    corpus_vect = vect.transform(corpus)
    words = ["beheld", "events", "dragged", "basin", "visit", "have"]
    d = Domain([corpus_vect.domain[w] for w in words])
    corpus_vect = corpus_vect.transform(d)

    self.send_signal(self.widget.Inputs.data, corpus_vect)
    self.send_signal(self.widget.Inputs.selected_data, corpus_vect[:1])
    self.wait_until_finished(timeout=100000)

    np.testing.assert_array_almost_equal(
        self.widget.results.p_values,
        [0.02128, 1, 0.04255, 0.06383, 0.08511, 0.97872],
        decimal=5,
    )
    np.testing.assert_array_almost_equal(
        self.widget.results.fdr_values,
        [0.12766, 1, 0.12766, 0.12766, 0.12766, 1],
        decimal=5,
    )
def test_apply_preprocessors(self):
    corpus = PreprocessorList(self.pp_list)(self.corpus)

    self.assertEqual(
        [8, 10, 6, 8, 9, 7, 7, 10, 4], list(map(len, corpus._tokens))
    )
    self.assertEqual(len(corpus.used_preprocessor.preprocessors), 5)