Esempio n. 1
0
 def test_valid_params(self):
     """Fit and transform the sample with every preprocessing option set.

     Checks that ``transform`` returns one entry per input document and
     that none of the transformed documents is empty.
     """
     # NOTE(review): the Spanish model id is paired with ``sample_en`` here;
     # confirm that combination is intentional.
     text_processor = TextPreprocessor(
         spacy_model_id='es',
         remove_stop_words=True,
         lemmatize=True,
         additional_pipes=[pipe_sample],
     )
     text_processor.fit(self.sample_en)
     transformed_text = text_processor.transform(self.sample_en)

     self.assertEqual(len(self.sample_en), len(transformed_text))
     for position, doc in enumerate(transformed_text):
         # assertNotEqual plus a message pinpoints the failing document,
         # whereas assertTrue(len(doc) != 0) only says "False is not true".
         self.assertNotEqual(
             len(doc), 0,
             'document at position {} is empty after preprocessing'.format(position),
         )
Esempio n. 2
0
    def test_sklearn_pipeline(self):
        """Smoke test: the preprocessor can be fitted inside a sklearn Pipeline.

        No assertion is made on the result; the test passes as long as
        fitting the three-step pipeline does not raise.
        """
        steps = [
            ('preprocessing', TextPreprocessor(spacy_model_id='en', lemmatize=True)),
            # The tokenizer is an identity function and lowercasing is off,
            # so the vectorizer consumes the previous step's output as-is.
            ('tf-idf', TfidfVectorizer(input='content',
                                       tokenizer=lambda tokens: tokens,
                                       preprocessor=None,
                                       lowercase=False)),
            ('clf', MultinomialNB()),
        ]
        Pipeline(steps=steps).fit(self.sample_en, self.labels_en)
Esempio n. 3
0
    def test_search_cv(self):
        """Smoke test: the preprocessor's parameters can be grid-searched.

        Builds a preprocessing -> TF-IDF -> naive Bayes pipeline and runs a
        2-fold ``GridSearchCV`` over the preprocessor's constructor
        parameters. No assertion is made on the scores; the test passes as
        long as the search completes without raising.
        """
        pipe = Pipeline(steps=[
            ('txt_prep', TextPreprocessor()),
            # A named function (not a lambda) is used as tokenizer because
            # the search runs with n_jobs=-1.
            # NOTE(review): presumably needed so worker processes can pickle
            # the pipeline -- confirm.
            ('tf-idf', TfidfVectorizer(input='content', tokenizer=do_nothing,
                                       preprocessor=None, lowercase=False)),
            ('clf', MultinomialNB())
        ])

        param_grid = {
            'txt_prep__spacy_model_id': ['en', 'es'],
            'txt_prep__lemmatize': [True, False],
            'txt_prep__remove_stop_words': [True, False],
            'txt_prep__additional_pipes': [None, [pipe_sample]]
        }

        # The ``iid`` argument was deprecated in scikit-learn 0.22 and removed
        # in 0.24; passing it raises TypeError on current releases. The test
        # never inspects the scores, so omitting it is safe on old versions too.
        grid_search = GridSearchCV(pipe, param_grid, n_jobs=-1, cv=2)
        grid_search.fit(self.sample_en, self.labels_en)
Esempio n. 4
0
    def test_invalid_params(self):
        """Each invalid constructor argument must make ``fit`` raise.

        Construction itself is expected to succeed; the
        ``InvalidArgumentError`` is only expected once ``fit`` is called.
        """
        # One keyword set per invalid case, in the order they are checked.
        invalid_kwargs = (
            {'remove_stop_words': 2},
            {'lemmatize': 3},
            {'additional_pipes': 'hi'},
            {'spacy_model_id': 'invented'},
        )
        for kwargs in invalid_kwargs:
            # Build outside the ``raises`` context so that only ``fit`` is
            # allowed to raise the expected error.
            preprocessor = TextPreprocessor(**kwargs)
            with pytest.raises(InvalidArgumentError):
                preprocessor.fit(self.sample_en)
Esempio n. 5
0
 def test_transform_before_fit(self):
     """Calling ``transform`` on an unfitted preprocessor must raise."""
     unfitted = TextPreprocessor()
     # Only the transform call sits inside the context, so a failure while
     # constructing the object cannot be mistaken for the expected error.
     with pytest.raises(NotFittedError):
         unfitted.transform(self.sample_en)