def test_create_frequency_term_matrix(self):
    """Build the frequency-term matrix once the vocabulary is indexed.

    The vectorizer is fitted on the training split first, then the
    preprocessor is asked for the frequency-term matrix of that same
    split. The test only checks that the result exposes ``transpose``.
    """
    text_prep = TextPreprocessor(self.articles)
    train_docs = text_prep.split_data()['train']

    # Fit the vectorizer on the training split; the returned matrix is
    # intentionally discarded, only the fitted state matters here.
    text_prep.count_vect.fit_transform(train_docs)

    # NOTE(review): an earlier draft called count_vect.transform() directly
    # at this point; frequency_term_matrix() presumably wraps that call --
    # confirm against TextPreprocessor.
    term_matrix = text_prep.frequency_term_matrix(train_docs)

    has_transpose = hasattr(term_matrix, 'transpose')
    self.assertTrue(has_transpose)
def test_tfidf_weighting(self):
    """Fit the tf-idf transformer and reweight the term-frequency matrix.

    The test only checks that the dense form of the weighted matrix
    exposes a ``shape`` attribute.
    """
    text_prep = TextPreprocessor(self.articles)
    train_docs = text_prep.split_data()['train']
    counts = text_prep.frequency_term_matrix(train_docs)

    # Step 1: fit() calculates the idf for the term-frequency matrix.
    text_prep.tf_transformer.fit(counts)

    # Step 2: with the idf calculated, transform() turns the same
    # term-frequency matrix into the tf-idf weight matrix.
    weighted = text_prep.tf_transformer.transform(counts)

    dense_weights = weighted.todense()
    self.assertTrue(hasattr(dense_weights, 'shape'))
def test_term_frequency_features(self):
    """Check the tf-idf helper, the last step before classification.

    Fits the preprocessor's tf-idf transformer on the training
    term-frequency matrix and asserts that the object returned by
    ``fit`` reports ``'l2'`` as its ``norm``.
    """
    text_prep = TextPreprocessor(self.articles)
    train_docs = text_prep.split_data()['train']
    counts = text_prep.frequency_term_matrix(train_docs)

    fitted = text_prep.tf_transformer.fit(counts)

    self.assertEqual(fitted.norm, 'l2')