def test_url_remover(self):
    """URLs with a scheme are stripped; bare domains are left untouched."""
    remover = preprocess.UrlRemover()
    with_scheme = remover.transform('some link to https://google.com/')
    self.assertEqual(with_scheme, 'some link to ')
    without_scheme = remover.transform('some link to google.com')
    self.assertEqual(without_scheme, 'some link to google.com')
Example #2
0
 def test_url_remover(self):
     """UrlRemover drops schemed URLs from corpus documents and is
     recorded as the single applied preprocessor."""
     url_remover = preprocess.UrlRemover()
     self.corpus.metas[0, 0] = 'some link to https://google.com/'
     self.corpus.metas[1, 0] = 'some link to google.com'
     processed = url_remover(self.corpus)
     expected = ['some link to ', 'some link to google.com']
     self.assertListEqual(processed.pp_documents[:2], expected)
     self.assertEqual(len(processed.used_preprocessor.preprocessors), 1)
Example #3
0
    def setUp(self) -> None:
        """Create the widget, a preprocessed corpus, and a scoring-words table."""
        self.widget: OWScoreDocuments = self.create_widget(OWScoreDocuments)

        # Chain each preprocessing step over the previous result.
        self.corpus = Corpus.from_file("book-excerpts")
        for step in (
            preprocess.LowercaseTransformer(),
            preprocess.StripAccentsTransformer(),
            preprocess.UrlRemover(),
            preprocess.SnowballStemmer(),
        ):
            self.corpus = step(self.corpus)

        # Table of words the documents will be scored against.
        self.words = create_words_table(["house", "doctor", "boy", "way", "Rum"])
Example #4
0
def pre_process(path):
    """Load a corpus from *path* and apply the standard preprocessing chain.

    The chain lowercases, removes URLs and HTML markup, tokenizes on word
    characters, stems with the Porter stemmer, and filters out English
    stopwords and common punctuation tokens.

    Args:
        path: File path accepted by ``Corpus.from_file``.

    Returns:
        The preprocessed corpus produced by the ``Preprocessor``.
    """
    corpus = orangecontrib.text.Corpus.from_file(path)

    # FIX: regex patterns are now raw strings. The originals were plain
    # strings whose '\w', '\.', '\?', ... are invalid escape sequences
    # (DeprecationWarning since Python 3.6, SyntaxWarning since 3.12);
    # the resulting regex behavior is unchanged.
    p = preprocess.Preprocessor(
        transformers=[
            preprocess.LowercaseTransformer(),
            preprocess.UrlRemover(),
            preprocess.HtmlTransformer()
        ],
        tokenizer=preprocess.RegexpTokenizer(r'\w+'),
        normalizer=preprocess.PorterStemmer(),
        filters=[
            preprocess.StopwordsFilter('english'),
            preprocess.RegexpFilter(
                r"\.|,|:|;|!|\?|\(|\)|\||\+|'|\"|‘|’|“|”|'|\’|…|\-|–|—|\$|&|\*|>|<"
            )
        ])
    return p(corpus)
Example #5
0
    def test_preprocess_words(self):
        """_preprocess_words applies the corpus' preprocessors to raw words."""
        corpus = Corpus.from_file("book-excerpts")
        words = [
            "House",
            "dóctor",
            "boy",
            "way",
            "Rum https://google.com",
            "https://google.com",
            "<p>abra<b>cadabra</b><p>",
        ]

        # Lowercasing, accent stripping, URL and HTML removal.
        for step in (
            preprocess.LowercaseTransformer(),
            preprocess.StripAccentsTransformer(),
            preprocess.UrlRemover(),
            preprocess.HtmlTransformer(),
        ):
            corpus = step(corpus)

        self.assertListEqual(
            ["house", "doctor", "boy", "way", "rum", "abracadabra"],
            _preprocess_words(corpus, words, dummy_callback),
        )

        words = ["House", "dóctor", "boys", "way", "Rum"]

        # Adding a stemmer on top changes how the words are normalized.
        corpus = preprocess.SnowballStemmer()(corpus)

        self.assertListEqual(
            ["hous", "doctor", "boy", "way", "rum"],
            _preprocess_words(corpus, words, dummy_callback),
        )
 def test_can_pickle(self):
     """A pickle round-trip preserves UrlRemover's URL pattern."""
     original = preprocess.UrlRemover()
     restored = pickle.loads(pickle.dumps(original))
     self.assertEqual(restored.urlfinder, original.urlfinder)
 def test_can_deepcopy(self):
     """Deep copying preserves UrlRemover's URL pattern."""
     original = preprocess.UrlRemover()
     clone = copy.deepcopy(original)
     self.assertEqual(clone.urlfinder, original.urlfinder)