def test_count_not_tokenized_yet(self):
    """Check ``representation.count`` on a Series that was not tokenized.

    Two things are verified on the raw string input ``"a b c c"``:
    the returned Series equals ``[[1, 1, 2]]``, and the call emits a
    ``DeprecationWarning``.
    """
    raw_text = pd.Series("a b c c")
    expected = pd.Series([[1, 1, 2]])

    # First pass: only the returned value matters, so silence every
    # warning to keep the test output clean.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        observed = representation.count(raw_text)
        self.assertEqual(observed, expected)

    # Second pass: only the warning matters; the result is discarded.
    with self.assertWarns(DeprecationWarning):
        representation.count(raw_text)
], ["remove_angle_brackets", preprocessing.remove_angle_brackets, (s_text, )], ["remove_brackets", preprocessing.remove_brackets, (s_text, )], ["remove_html_tags", preprocessing.remove_html_tags, (s_text, )], ["tokenize", preprocessing.tokenize, (s_text, )], ["phrases", preprocessing.phrases, (s_tokenized_lists, )], ["replace_urls", preprocessing.replace_urls, (s_text, "")], ["remove_urls", preprocessing.remove_urls, (s_text, )], ["replace_tags", preprocessing.replace_tags, (s_text, "")], ["remove_tags", preprocessing.remove_tags, (s_text, )], ] test_cases_representation = [ [ "count", lambda x: representation.flatten(representation.count(x)), (s_tokenized_lists, ), ], [ "term_frequency", lambda x: representation.flatten(representation.term_frequency(x)), (s_tokenized_lists, ), ], [ "tfidf", lambda x: representation.flatten(representation.tfidf(x)), (s_tokenized_lists, ), ], ["pca", representation.pca, (s_numeric_lists, 0)], ["nmf", representation.nmf, (s_numeric_lists, )], ["tsne", representation.tsne, (s_numeric_lists, )],
def test_count_punctuation_are_kept(self):
    """Check that ``count`` on the tokenized text ``"one !"`` yields ``[[1, 1]]``.

    Two entries of 1 are expected, i.e. the punctuation mark is counted
    alongside the word rather than being dropped.
    """
    expected = pd.Series([[1, 1]])
    tokens = preprocessing.tokenize(pd.Series(["one !"]))

    observed = representation.count(tokens)

    self.assertEqual(observed, expected)
def test_count_not_lowercase(self):
    """Check that ``count`` on the tokenized text ``"one ONE"`` yields ``[[1, 1]]``.

    Two separate entries of 1 are expected, i.e. the differently-cased
    tokens are not merged into a single count of 2.
    """
    expected = pd.Series([[1, 1]])
    tokens = preprocessing.tokenize(pd.Series(["one ONE"]))

    observed = representation.count(tokens)

    self.assertEqual(observed, expected)
def test_count_multiple_documents(self):
    """Check ``count`` on a two-row Series: ``"doc_one"`` and ``"doc_two"``.

    After tokenization the expected result is one row per document,
    ``[1, 0]`` for the first and ``[0, 1]`` for the second.
    """
    documents = pd.Series(["doc_one", "doc_two"])
    expected = pd.Series([[1, 0], [0, 1]])

    tokens = preprocessing.tokenize(documents)
    observed = representation.count(tokens)

    self.assertEqual(observed, expected)
def test_count_single_document(self):
    """Check ``count`` on the single tokenized document ``"a b c c"``.

    The expected result is a one-row Series holding ``[1, 1, 2]``.
    """
    expected = pd.Series([[1, 1, 2]])
    tokens = preprocessing.tokenize(pd.Series("a b c c"))

    observed = representation.count(tokens)

    self.assertEqual(observed, expected)