def test_corpus_preprocessed(self):
    """Summary of a tokenized corpus reports the expected count and details.

    Loads "book-excerpts", tokenizes it with the default RegexpTokenizer
    and compares the dispatched Corpus summary against the expected
    summary value and the expected HTML details string.
    """
    tokenized = RegexpTokenizer()(Corpus.from_file("book-excerpts"))
    domain = tokenized.domain
    n_features = len(domain.variables) + len(domain.metas)

    # One entry per line of the details text; joined without a separator
    # because every entry already carries its own "<br/>" where needed.
    expected_details = "".join([
        f"<nobr>{len(tokenized)} instances, {n_features} variables</nobr><br/>",
        f"<nobr>Features: — (no missing values)</nobr><br/>",
        f"<nobr>Target: categorical</nobr><br/>",
        f"<nobr>Metas: string</nobr><br/>",
        f"<nobr>Tokens: 128020, Types: 11712</nobr>",
    ])

    summarize_corpus = summarize.dispatch(Corpus)
    result = summarize_corpus(tokenized)

    self.assertEqual(140, result.summary)
    self.assertEqual(expected_details, result.details)
def preprocess(corpus: Corpus) -> Corpus:
    """
    Preprocess a corpus and attach a two-component PCA projection to it.

    The corpus is lowercased, tokenized with the ``\\w+`` pattern, filtered
    with the "English" stopword filter and a frequency filter (0.1), and
    then vectorized with ``BowVectorizer``. A PCA with two components is
    fitted on the vectorized corpus, and the projection's attributes are
    appended after the existing metas in the resulting domain.

    Parameters
    ----------
    corpus
        Corpus to preprocess.

    Returns
    -------
    The preprocessed corpus transformed into the extended domain.
    """
    pipeline = (
        LowercaseTransformer(),
        RegexpTokenizer(r"\w+"),
        StopwordsFilter("English"),
        FrequencyFilter(0.1),
    )
    for step in pipeline:
        corpus = step(corpus)

    bow = BowVectorizer().transform(corpus)

    # Fit the PCA on the vectorized corpus, then project that same data.
    fitted_pca = PCA(n_components=2)(bow)
    projected = fitted_pca(bow)

    bow_domain = bow.domain
    extended_metas = chain(bow_domain.metas, projected.domain.attributes)
    extended_domain = Domain(
        bow_domain.attributes, bow_domain.class_vars, extended_metas
    )
    return corpus.transform(extended_domain)
def preprocess_only_words(corpus: Corpus) -> Corpus:
    """
    Lowercase the corpus and split it into words (dropping punctuation).

    Parameters
    ----------
    corpus
        Corpus that the preprocessors are applied to.

    Returns
    -------
    The preprocessed corpus; the outcome of preprocessing is stored in
    tokens/ngrams.
    """
    steps = [
        LowercaseTransformer(),
        # with its default pattern the regexp tokenizer keeps words only
        # (no punctuation, no whitespace)
        RegexpTokenizer(),
    ]
    pipeline = PreprocessorList(steps)
    return pipeline(corpus)
def test_result(self):
    """Widget results match the expected p-values and FDR values.

    Every third document of "book-excerpts" is preprocessed and
    vectorized, the domain is reduced to six chosen words, and the widget
    receives the whole table as data and its first row as selected data.
    """
    preprocessors = PreprocessorList([BASE_TRANSFORMER, RegexpTokenizer()])
    documents = preprocessors(Corpus.from_file("book-excerpts")[::3])
    bow = BowVectorizer().transform(documents)

    # Keep only the columns for the words under test.
    words = ["beheld", "events", "dragged", "basin", "visit", "have"]
    word_columns = [bow.domain[word] for word in words]
    bow = bow.transform(Domain(word_columns))

    inputs = self.widget.Inputs
    self.send_signal(inputs.data, bow)
    self.send_signal(inputs.selected_data, bow[:1])
    self.wait_until_finished(timeout=100000)

    expected_p_values = [0.02128, 1, 0.04255, 0.06383, 0.08511, 0.97872]
    expected_fdr_values = [0.12766, 1, 0.12766, 0.12766, 0.12766, 1]

    np.testing.assert_array_almost_equal(
        self.widget.results.p_values, expected_p_values, decimal=5
    )
    np.testing.assert_array_almost_equal(
        self.widget.results.fdr_values, expected_fdr_values, decimal=5
    )
# NOTE(review): this statement is the tail of a method whose `def` line lies
# before the visible chunk; it sets the `_invalidated` flag on the instance.
self._invalidated = True

# NOTE(review): method of a class whose header is outside this chunk.
def onDeleteWidget(self):
    """Call ``self.shutdown()`` and then the parent's ``onDeleteWidget``."""
    # shutdown() is defined elsewhere; presumably it stops work still in
    # progress before the widget goes away — confirm against its definition.
    self.shutdown()
    super().onDeleteWidget()

if __name__ == "__main__":
    # Script entry point: builds a sample corpus for trying the widget out.
    # Imports are local so they are only needed when run as a script.
    from Orange.projection import PCA
    # NOTE(review): WidgetPreview is not used in the visible part of this
    # block; the block presumably continues past the end of this chunk.
    from Orange.widgets.utils.widgetpreview import WidgetPreview
    from orangecontrib.text.preprocess import LowercaseTransformer, \
        RegexpTokenizer, StopwordsFilter, FrequencyFilter
    from orangecontrib.text.vectorization import BowVectorizer

    corpus_ = Corpus.from_file("book-excerpts")
    # Apply the preprocessors one after another, each on the previous result.
    for pp in (LowercaseTransformer(), RegexpTokenizer(r"\w+"),
               StopwordsFilter("English"), FrequencyFilter(0.1)):
        corpus_ = pp(corpus_)

    transformed_corpus = BowVectorizer().transform(corpus_)

    # Fit a two-component PCA on the vectorized corpus and project it.
    pca = PCA(n_components=2)
    pca_model = pca(transformed_corpus)
    projection = pca_model(transformed_corpus)

    # Same attributes and class variables as the vectorized corpus; the
    # projection's attributes are appended after the existing metas.
    domain_ = Domain(
        transformed_corpus.domain.attributes,
        transformed_corpus.domain.class_vars,
        chain(transformed_corpus.domain.metas, projection.domain.attributes))
    corpus_ = corpus_.transform(domain_)