Example #1
0
 def setUp(self):
     """Load the sample corpus and build the full preprocessing pipeline."""
     self.corpus = Corpus.from_file("deerwester")
     # Order matters: transform -> tokenize -> normalize -> n-grams -> POS tag.
     self.pp_list = [
         preprocess.LowercaseTransformer(),
         preprocess.WordPunctTokenizer(),
         preprocess.SnowballStemmer(),
         preprocess.NGrams(),
         tag.AveragedPerceptronTagger(),
     ]
 def test_string_processor(self):
     """LowercaseTransformer must lowercase every token of every document."""
     transformer = preprocess.LowercaseTransformer()
     # Snapshot the untouched tokens before running the transformer.
     original_tokens = self.corpus.tokens.copy()
     result = transformer(self.corpus).tokens
     expected = np.array(
         [[token.lower() for token in doc] for doc in original_tokens],
         dtype="object",
     )
     np.testing.assert_equal(result, expected)
Example #3
0
 def create_corpus(texts: List[str]) -> Corpus:
     """Build a corpus with one text meta column holding *texts*, lowercased."""
     text_var = StringVariable("Text")
     corpus = Corpus(
         Domain([], metas=[text_var]),
         metas=np.array(texts).reshape(-1, 1),
         text_features=[text_var],
     )
     # Lowercase up front so callers receive normalized text.
     return preprocess.LowercaseTransformer()(corpus)
    def test_string_processor(self):
        """A scalar transformer argument behaves like a one-element list."""
        lowered = Preprocessor(
            transformers=preprocess.LowercaseTransformer())(self.corpus).tokens
        untouched = Preprocessor(transformers=[])(self.corpus).tokens

        expected = [[t.lower() for t in doc] for doc in untouched]
        np.testing.assert_equal(lowered, expected)

        # A non-transformer argument must be rejected.
        self.assertRaises(TypeError, Preprocessor, string_transformers=1)
Example #5
0
 def set_corpus(self, data=None):
     """Input handler: store the corpus and cache a preprocessed copy.

     The preprocessed corpus is built once on input so each method run
     can reuse it.  ``data`` may be None (input removed); in that case no
     preprocessing is attempted — the original applied PreprocessorList
     unconditionally, which fails on None input.
     """
     self.corpus = data
     self.pp_corpus = None
     if self.corpus is not None:
         if self.corpus.has_tokens():
             # Already tokenized upstream -- reuse as-is.
             self.pp_corpus = self.corpus
         else:
             # create preprocessed corpus upon setting data to avoid
             # preprocessing at each method run
             pp_list = [
                 preprocess.LowercaseTransformer(),
                 preprocess.WordPunctTokenizer(),
             ]
             self.pp_corpus = PreprocessorList(pp_list)(self.corpus)
     self.commit()
Example #6
0
 def set_corpus(self, data=None):
     """Store the input corpus and cache a preprocessed version of it."""
     self.corpus = data
     self.pp_corpus = None
     if self.corpus is not None:
         if self.corpus.has_tokens():
             # Tokens already computed upstream; reuse the corpus directly.
             self.pp_corpus = self.corpus
         else:
             # Preprocess once on input so each method run can reuse the
             # cached result instead of recomputing it.
             steps = [
                 preprocess.LowercaseTransformer(),
                 preprocess.WordPunctTokenizer(),
             ]
             self.pp_corpus = PreprocessorList(steps)(self.corpus)
     self.commit.now()
 def test_filter_pos_tags(self):
     """POS tags must stay aligned with tokens after stopword filtering."""
     pipeline = [
         preprocess.LowercaseTransformer(),
         preprocess.WordPunctTokenizer(),
         tag.AveragedPerceptronTagger(),
         preprocess.StopwordsFilter(),
     ]
     corpus = self.corpus
     with corpus.unlocked():
         corpus.metas[0, 0] = "This is the most beautiful day in the world"
     for step in pipeline:
         corpus = step(corpus)
     # Tag arrays and token arrays must have matching lengths per document.
     self.assertEqual(len(corpus.tokens), len(corpus.pos_tags))
     self.assertEqual(len(corpus.tokens[0]), len(corpus.pos_tags[0]))
     self.assertEqual(corpus.tokens[0], ["beautiful", "day", "world"])
     self.assertEqual(corpus.pos_tags[0], ["JJ", "NN", "NN"])
Example #8
0
    def setUp(self) -> None:
        """Create the widget, a stemmed sample corpus, and a words table."""
        self.widget: OWScoreDocuments = self.create_widget(OWScoreDocuments)

        # Preprocess the sample corpus the way the widget expects.
        self.corpus = Corpus.from_file("book-excerpts")
        for step in (
            preprocess.LowercaseTransformer(),
            preprocess.StripAccentsTransformer(),
            preprocess.SnowballStemmer(),
        ):
            self.corpus = step(self.corpus)

        # Table of candidate words to score documents against.
        self.words = self.create_words_table(
            ["house", "doctor", "boy", "way", "Rum"])
 def test_preprocessed(self):
     """The widget keeps its preprocessed corpus across method switches."""
     widget = self.create_widget(OWSentimentAnalysis)
     corpus = self.corpus.copy()
     for step in (preprocess.LowercaseTransformer(),
                  preprocess.WordPunctTokenizer()):
         corpus = step(corpus)
     self.send_signal(widget.Inputs.corpus, corpus)
     self.assertTrue(widget.pp_corpus)
     # Switching to the Liu Hu method must not drop the cached corpus.
     widget.liu_hu.click()
     simulate.combobox_activate_item(widget.liu_lang, "English")
     self.assertTrue(widget.pp_corpus)
     # Removing the input clears the cache.
     self.send_signal(widget.Inputs.corpus, None)
     self.assertIsNone(widget.pp_corpus)
Example #10
0
    def test_pickle_corpus(self):
        """
        Corpus must be picklable (for save data widget)
        gh-590
        """
        corpus = Corpus.from_file('book-excerpts')

        # A fully preprocessed corpus must be picklable as well.
        self.pp_list = [
            preprocess.LowercaseTransformer(),
            preprocess.WordPunctTokenizer(),
            preprocess.SnowballStemmer(),
            preprocess.FrequencyFilter(),
            preprocess.StopwordsFilter(),
        ]
        for step in self.pp_list:
            corpus = step(corpus)
        # Raises if any preprocessing artifact is not picklable.
        pickle.dumps(corpus)
Example #11
0
def pre_process(path):
    """Load the corpus at *path* and run the standard preprocessing chain.

    Parameters
    ----------
    path : str
        Path (or dataset name) understood by ``Corpus.from_file``.

    Returns
    -------
    Corpus
        Corpus with lowercased, URL- and HTML-stripped text, tokenized on
        word characters, Porter-stemmed, with stopwords and punctuation
        filtered out.
    """
    corpus = orangecontrib.text.Corpus.from_file(path)

    # Regex patterns are raw strings: the originals were plain strings with
    # invalid escape sequences (\w, \., \? ...), which warn on modern Python.
    p = preprocess.Preprocessor(
        transformers=[
            preprocess.LowercaseTransformer(),
            preprocess.UrlRemover(),
            preprocess.HtmlTransformer()
        ],
        tokenizer=preprocess.RegexpTokenizer(r'\w+'),
        normalizer=preprocess.PorterStemmer(),
        filters=[
            preprocess.StopwordsFilter('english'),
            preprocess.RegexpFilter(
                r'\.|,|:|;|!|\?|\(|\)|\||\+|\'|\"|‘|’|“|”|\'|\’|…|\-|–|—|\$|&|\*|>|<'
            )
        ])
    return p(corpus)
    def test_string_processor(self):
        """Custom string transformers compose with the built-in ones."""
        class StripStringTransformer(preprocess.BaseTransformer):
            # Drop the final character of each document.
            @classmethod
            def transform(cls, string):
                return string[:-1]

        p = Preprocessor(transformers=StripStringTransformer())
        np.testing.assert_equal(
            p(self.corpus).tokens,
            np.array([[doc[:-1]] for doc in self.corpus.documents]))

        # Chaining: strip the last character, then lowercase.
        p = Preprocessor(transformers=[
            StripStringTransformer(),
            preprocess.LowercaseTransformer(),
        ])
        np.testing.assert_equal(
            p(self.corpus).tokens,
            np.array([[doc[:-1].lower()] for doc in self.corpus.documents]))

        # A non-transformer argument must be rejected.
        self.assertRaises(TypeError, Preprocessor, string_transformers=1)
Example #13
0
    def test_preprocess_words(self):
        """_preprocess_words applies the corpus' preprocessors to the words."""
        corpus = Corpus.from_file("book-excerpts")
        words = [
            "House",
            "dóctor",
            "boy",
            "way",
            "Rum https://google.com",
            "https://google.com",
            "<p>abra<b>cadabra</b><p>",
        ]

        # Transformers only: lowercase, strip accents, remove URLs/HTML.
        for step in (
            preprocess.LowercaseTransformer(),
            preprocess.StripAccentsTransformer(),
            preprocess.UrlRemover(),
            preprocess.HtmlTransformer(),
        ):
            corpus = step(corpus)

        self.assertListEqual(
            ["house", "doctor", "boy", "way", "rum", "abracadabra"],
            _preprocess_words(corpus, words, dummy_callback),
        )

        # With a stemmer added, words are reduced to their stems as well.
        words = ["House", "dóctor", "boys", "way", "Rum"]

        for step in (preprocess.SnowballStemmer(),):
            corpus = step(corpus)

        self.assertListEqual(
            ["hous", "doctor", "boy", "way", "rum"],
            _preprocess_words(corpus, words, dummy_callback),
        )
 def test_lowercase(self):
     """_preprocess lowercases ASCII and accented characters alike."""
     transformer = preprocess.LowercaseTransformer()
     for raw, lowered in (('Abra', 'abra'), ('\u00C0bra', '\u00E0bra')):
         self.assertEqual(transformer._preprocess(raw), lowered)
Example #15
0
            raise NotImplementedError

        self.view.selectionModel().select(selection,
                                          QItemSelectionModel.ClearAndSelect)


if __name__ == "__main__":
    from orangewidget.utils.widgetpreview import WidgetPreview

    from orangecontrib.text import preprocess

    # Manual preview: preprocess a sample corpus and feed it to the widget.
    corpus = Corpus.from_file("book-excerpts")
    # corpus.set_title_variable("Text")

    for step in (
        preprocess.LowercaseTransformer(),
        preprocess.StripAccentsTransformer(),
        preprocess.SnowballStemmer(),
    ):
        corpus = step(corpus)

    # Build a one-column words table tagged as type "words".
    words_var = StringVariable("Words")
    words_var.attributes["type"] = "words"
    word_list = ["house", "doctor", "boy", "way", "Rum"]
    words = Table(
        Domain([], metas=[words_var]),
        np.empty((len(word_list), 0)),
        metas=np.array(word_list).reshape((-1, 1)),
    )
    WidgetPreview(OWScoreDocuments).run(set_data=corpus, set_words=words)