def test_phrases_not_tokenized_yet(self): s = pd.Series([ "New York is a beautiful city", "Look: New York!", "Very beautiful city New York", ]) s_true = pd.Series([ ["New", "York", "is", "a", "beautiful", "city"], ["Look", ":", "New", "York", "!"], ["Very", "beautiful", "city", "New", "York"], ]) with warnings.catch_warnings(): # avoid print warning warnings.simplefilter("ignore") self.assertEqual(preprocessing.phrases(s), s_true) with self.assertWarns(DeprecationWarning): # check raise warning preprocessing.phrases(s)
def test_phrases_threshold(self): s = pd.Series([ ["New", "York", "is", "a", "beautiful", "city"], ["Look", ":", "New", "York", "!"], ["Very", "beautiful", "city", "New", "York"], ]) s_true = pd.Series([ ["New_York", "is", "a", "beautiful", "city"], ["Look", ":", "New_York", "!"], ["Very", "beautiful", "city", "New_York"], ]) self.assertEqual(preprocessing.phrases(s, min_count=2, threshold=2), s_true)