Example #1
    def test_corpus_preprocessed(self):
        """Check if details part of the summary is formatted correctly"""
        corpus = Corpus.from_file("book-excerpts")
        corpus = RegexpTokenizer()(corpus)

        n_features = len(corpus.domain.variables) + len(corpus.domain.metas)
        details = (
            f"<nobr>{len(corpus)} instances, {n_features} variables</nobr><br/>"
            f"<nobr>Features: — (no missing values)</nobr><br/>"
            f"<nobr>Target: categorical</nobr><br/>"
            f"<nobr>Metas: string</nobr><br/>"
            f"<nobr>Tokens: 128020, Types: 11712</nobr>")
        summary = summarize.dispatch(Corpus)(corpus)
        self.assertEqual(140, summary.summary)  # book-excerpts contains 140 documents
        self.assertEqual(details, summary.details)
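The values checked above can also be inspected directly; a minimal sketch, assuming summarize is the singledispatch helper from Orange.widgets.utils.state_summary and that the Text add-on registers a Corpus-specific summarizer:

from Orange.widgets.utils.state_summary import summarize
from orangecontrib.text import Corpus
from orangecontrib.text.preprocess import RegexpTokenizer

corpus = RegexpTokenizer()(Corpus.from_file("book-excerpts"))
summary = summarize.dispatch(Corpus)(corpus)
print(summary.summary)   # number of documents (140 for book-excerpts)
print(summary.details)   # HTML details with variable, token and type counts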
Example #2
from itertools import chain

from Orange.data import Domain
from Orange.projection import PCA
from orangecontrib.text import Corpus
from orangecontrib.text.preprocess import (
    LowercaseTransformer, RegexpTokenizer, StopwordsFilter, FrequencyFilter)
from orangecontrib.text.vectorization import BowVectorizer


def preprocess(corpus: Corpus) -> Corpus:
    # Lower-case, tokenize into words, drop English stopwords and rare tokens.
    for pp in (LowercaseTransformer(), RegexpTokenizer(r"\w+"),
               StopwordsFilter("English"), FrequencyFilter(0.1)):
        corpus = pp(corpus)

    transformed_corpus = BowVectorizer().transform(corpus)

    # Project the bag-of-words matrix onto its first two principal components.
    pca = PCA(n_components=2)
    pca_model = pca(transformed_corpus)
    projection = pca_model(transformed_corpus)

    # Append the two PCA components to the corpus as additional meta attributes.
    domain = Domain(
        transformed_corpus.domain.attributes,
        transformed_corpus.domain.class_vars,
        chain(transformed_corpus.domain.metas, projection.domain.attributes))
    return corpus.transform(domain)
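A minimal usage sketch for the helper above, assuming the imports shown and the book-excerpts corpus bundled with the Text add-on:

corpus = preprocess(Corpus.from_file("book-excerpts"))
print(len(corpus))               # number of documents
print(corpus.domain.metas[-2:])  # the two PCA components appended as metas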
Example #3
from orangecontrib.text import Corpus
from orangecontrib.text.preprocess import (
    LowercaseTransformer, PreprocessorList, RegexpTokenizer)


def preprocess_only_words(corpus: Corpus) -> Corpus:
    """
    Apply a preprocessor that splits the text into words, transforms them to
    lower case, and removes punctuation.

    Parameters
    ----------
    corpus
        Corpus on which the preprocessor will be applied.

    Returns
    -------
    Preprocessed corpus; the result of preprocessing is stored in tokens/ngrams.
    """
    p = PreprocessorList(
        [LowercaseTransformer(),
         # by default regexp keeps only words (no punctuations, no spaces)
         RegexpTokenizer()]
    )
    return p(corpus)
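A minimal usage sketch, assuming the imports above and the bundled book-excerpts corpus:

corpus = preprocess_only_words(Corpus.from_file("book-excerpts"))
print(corpus.tokens[0][:10])   # lower-cased word tokens of the first document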
Example #4
    def test_result(self):
        pp = PreprocessorList([BASE_TRANSFORMER, RegexpTokenizer()])
        corpus = pp(Corpus.from_file("book-excerpts")[::3])
        vect = BowVectorizer()
        corpus_vect = vect.transform(corpus)

        words = ["beheld", "events", "dragged", "basin", "visit", "have"]
        d = Domain([corpus_vect.domain[w] for w in words])
        corpus_vect = corpus_vect.transform(d)

        self.send_signal(self.widget.Inputs.data, corpus_vect)
        self.send_signal(self.widget.Inputs.selected_data, corpus_vect[:1])
        self.wait_until_finished(timeout=100000)

        np.testing.assert_array_almost_equal(
            self.widget.results.p_values,
            [0.02128, 1, 0.04255, 0.06383, 0.08511, 0.97872],
            decimal=5,
        )
        np.testing.assert_array_almost_equal(
            self.widget.results.fdr_values,
            [0.12766, 1, 0.12766, 0.12766, 0.12766, 1],
            decimal=5,
        )
Example #5
        self._invalidated = True

    def onDeleteWidget(self):
        self.shutdown()
        super().onDeleteWidget()


if __name__ == "__main__":
    from Orange.projection import PCA
    from Orange.widgets.utils.widgetpreview import WidgetPreview
    from orangecontrib.text.preprocess import LowercaseTransformer, \
        RegexpTokenizer, StopwordsFilter, FrequencyFilter
    from orangecontrib.text.vectorization import BowVectorizer

    # Build a demo corpus: preprocess, vectorize with bag-of-words, and append
    # two PCA components as meta attributes.
    corpus_ = Corpus.from_file("book-excerpts")
    for pp in (LowercaseTransformer(), RegexpTokenizer(r"\w+"),
               StopwordsFilter("English"), FrequencyFilter(0.1)):
        corpus_ = pp(corpus_)

    transformed_corpus = BowVectorizer().transform(corpus_)

    pca = PCA(n_components=2)
    pca_model = pca(transformed_corpus)
    projection = pca_model(transformed_corpus)

    domain_ = Domain(
        transformed_corpus.domain.attributes,
        transformed_corpus.domain.class_vars,
        chain(transformed_corpus.domain.metas, projection.domain.attributes))
    corpus_ = corpus_.transform(domain_)
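In the original widget module this demo block would typically finish by handing corpus_ to WidgetPreview (already imported above); the widget class named below is a hypothetical placeholder, not taken from the source:

    # Hypothetical placeholder; substitute the widget class defined in the module.
    # WidgetPreview(OWExampleWidget).run(corpus_)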