Example #1
    def test_phrases_not_tokenized_yet(self):
        s = pd.Series([
            "New York is a beautiful city",
            "Look: New York!",
            "Very beautiful city New York",
        ])

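        # Expected result: the raw strings come back tokenized, with no bigrams merged.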
        s_true = pd.Series([
            ["New", "York", "is", "a", "beautiful", "city"],
            ["Look", ":", "New", "York", "!"],
            ["Very", "beautiful", "city", "New", "York"],
        ])

        with warnings.catch_warnings():  # suppress the warning while comparing the output
            warnings.simplefilter("ignore")
            self.assertEqual(preprocessing.phrases(s), s_true)

        with self.assertWarns(DeprecationWarning):  # non-tokenized input must raise a DeprecationWarning
            preprocessing.phrases(s)
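
These methods read like excerpts from a larger unittest class, so they are not runnable on their own. Below is a minimal sketch of the scaffolding they appear to assume: the imports (warnings, pandas, a preprocessing module, here assumed to be texthero's) follow from the names used above, while the PandasTestCase base class with a Series-aware assertEqual is an assumption, since unittest's stock assertEqual cannot compare two pandas Series.

    import unittest
    import warnings  # used by test_phrases_not_tokenized_yet above

    import pandas as pd

    from texthero import preprocessing  # assumed import path for `preprocessing`


    class PandasTestCase(unittest.TestCase):
        # Hypothetical helper: delegate Series comparison to pandas' own
        # testing utility, since a plain assertEqual would try to truth-test
        # the element-wise comparison and fail.
        def assertEqual(self, first, second, msg=None):
            pd.testing.assert_series_equal(first, second)


    class TestPhrases(PandasTestCase):
        # The test_phrases_* methods shown in these examples would live here.
        ...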
Example #2
    def test_phrases_threshold(self):
        s = pd.Series([
            ["New", "York", "is", "a", "beautiful", "city"],
            ["Look", ":", "New", "York", "!"],
            ["Very", "beautiful", "city", "New", "York"],
        ])

        s_true = pd.Series([
            ["New_York", "is", "a", "beautiful", "city"],
            ["Look", ":", "New_York", "!"],
            ["Very", "beautiful", "city", "New_York"],
        ])

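        # With min_count=2 and threshold=2, "New York" is frequent enough to be merged into "New_York".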
        self.assertEqual(preprocessing.phrases(s, min_count=2, threshold=2),
                         s_true)
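
Outside the test harness, the same behavior can be tried directly. A standalone sketch, assuming the preprocessing module is texthero's and importable as below, with the min_count and threshold keywords used in Example #2:

    import pandas as pd

    from texthero import preprocessing  # assumed import path for `preprocessing`

    s = pd.Series([
        ["New", "York", "is", "a", "beautiful", "city"],
        ["Look", ":", "New", "York", "!"],
        ["Very", "beautiful", "city", "New", "York"],
    ])

    # Per the expected output in Example #2, the frequent "New York" bigram
    # should come back merged into a single "New_York" token.
    print(preprocessing.phrases(s, min_count=2, threshold=2))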