def test_define_features_tfidf(self): """ Test Case for Feature Defintion using Tfidf""" training_data, testing_data, training_y, testing_y = split_data( self.df_concatenated, 'text', 'hate_speech', 0.25) vectorizer, training_features, testing_features = define_features_tfidf( "text", training_data, testing_data) vec2, training_features2, testing_features2 = define_features_tfidf( "text", self.test_set) """ Test correct data types """ self.assertIsInstance(vectorizer, sklearn.feature_extraction.text.TfidfVectorizer) self.assertIsInstance(training_features, scipy.sparse.csr.csr_matrix) self.assertIsInstance(testing_features, scipy.sparse.csr.csr_matrix) """ Test None case """ self.assertIsInstance(vec2, sklearn.feature_extraction.text.TfidfVectorizer) self.assertIsInstance(training_features2, scipy.sparse.csr.csr_matrix) self.assertIsNone(testing_features2)
def test_define_features_vectorizer(self): """ Test Case for Feature Defintion using CountVectorizer""" training_data, testing_data, training_y, testing_y = split_data( self.df_concatenated, 'text', 'hate_speech', 0.25) vectorizer, training_features, testing_features = define_features_vectorizer( "text", training_data, testing_data) vec2, training_features2, testing_features2 = define_features_vectorizer( "text", self.test_set) """ Test correct data types """ self.assertIsInstance(vectorizer, sklearn.feature_extraction.text.CountVectorizer) self.assertIsInstance(training_features, scipy.sparse.csr.csr_matrix) self.assertIsInstance(testing_features, scipy.sparse.csr.csr_matrix) """ Test None case """ self.assertIsInstance(vec2, sklearn.feature_extraction.text.CountVectorizer) self.assertIsInstance(training_features2, scipy.sparse.csr.csr_matrix) self.assertIsNone(testing_features2) self.assertTrue( (training_features2.toarray() == self.test_result_count).all())
def setUp(self): self.df = load_data( os.path.join(os.path.pardir, 'src', 'data', 'tweets.csv')) self.df2, self.df3 = get_datasets( os.path.join(os.path.pardir, 'src', 'data', 'labeled_data.csv'), os.path.join(os.path.pardir, 'src', 'data', 'hatespeech_text_label_vote_RESTRICTED_100K.csv')) self.df_concatenated = concatenate_datasets( os.path.join(os.path.pardir, 'src', 'data', 'tweets.csv'), self.df2, self.df3) self.training_data, self.testing_data, self.training_y, self.testing_y = split_data( self.df_concatenated, 'text', 'hate_speech', 0.25)