Example no. 1
    def test_idf_not_tokenized_yet(self):
        s = pd.Series("a")
        s_true = pd.Series([[1]])

        with warnings.catch_warnings():  # avoid print warning
            warnings.simplefilter("ignore")
            self.assertEqual(representation.tfidf(s), s_true)

        with self.assertWarns(DeprecationWarning):  # check raise warning
            representation.tfidf(s)
Example no. 2
    def test_idf_single_not_lowercase(self):
        tfidf_single_smooth = 0.7071067811865475  # TODO

        s = pd.Series("ONE one")
        s = preprocessing.tokenize(s)
        s_true = pd.Series([[tfidf_single_smooth, tfidf_single_smooth]])
        self.assertEqual(representation.tfidf(s), s_true)
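The constant 0.7071067811865475 is 1/√2: both tokens occur once in the single document, so their raw tf-idf weights are equal, and an l2-normalised vector with two equal entries holds 1/√2 in each position. A minimal sketch of that computation, assuming the implementation delegates to scikit-learn's TfidfVectorizer with its default l2 norm:

import math

from sklearn.feature_extraction.text import TfidfVectorizer

# "ONE" and "one" stay distinct without lowercasing; each occurs once in the
# only document, so the l2-normalised weights are both 1 / sqrt(2).
weights = TfidfVectorizer(lowercase=False).fit_transform(["ONE one"]).toarray()[0]
print(weights)                                      # [0.70710678 0.70710678]
print(math.isclose(weights[0], 1 / math.sqrt(2)))   # True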
Example no. 3
    def test_tfidf_formula(self):
        s = pd.Series(["Hi Bye", "Test Bye Bye"])
        s = preprocessing.tokenize(s)
        s_true_index = pd.MultiIndex.from_tuples([(0, "Bye"), (0, "Hi"),
                                                  (1, "Bye"), (1, "Test")], )
        s_true = pd.Series([_tfidf(x[1], s, x[0]) for x in s_true_index],
                           index=s_true_index).astype("Sparse")

        self.assertEqual(representation.tfidf(s), s_true)
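The `_tfidf` helper used to build the expected values is not part of this excerpt. A hypothetical sketch of what a helper with that signature could compute, assuming the smoothed-idf convention that the literal constants in the next example follow:

import math

def _tfidf(term, corpus, document_index):
    # Hypothetical stand-in: term count in one tokenized document times the
    # smoothed inverse document frequency over the whole corpus.
    tf = corpus[document_index].count(term)
    df = sum(term in document for document in corpus)
    return tf * (math.log((1 + len(corpus)) / (1 + df)) + 1)

With the two-document corpus above this yields 1.0 for (0, "Bye"), math.log(3 / 2) + 1 for (0, "Hi") and (1, "Test"), and 2.0 for (1, "Bye").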
Example no. 4
    def test_tfidf_formula(self):
        s = pd.Series(["Hi Bye", "Test Bye Bye"])
        s = preprocessing.tokenize(s)
        s_true = pd.Series([
            [
                1.0 * (math.log(3 / 3) + 1),
                1.0 * (math.log(3 / 2) + 1),
                0.0 * (math.log(3 / 2) + 1),
            ],
            [
                2.0 * (math.log(3 / 3) + 1),
                0.0 * (math.log(3 / 2) + 1),
                1.0 * (math.log(3 / 2) + 1),
            ],
        ])
        s_true.rename_axis("document", inplace=True)
        self.assertEqual(representation.tfidf(s), s_true)
Example no. 5
    def test_idf_single_lowercase(self):
        s = pd.Series("ONE one")
        s_true = pd.Series([[1.0]])
        self.assertEqual(representation.tfidf(s, lowercase=True), s_true)
Example no. 6
    def test_idf_single_document(self):
        s = pd.Series("a")
        s_true = pd.Series([[1]])
        self.assertEqual(representation.tfidf(s), s_true)
Example no. 7
    def test_tfidf_single_document(self):
        s = pd.Series("a", index=["yo"])
        s = preprocessing.tokenize(s)
        s_true = pd.Series([[1]], index=["yo"])
        s_true.rename_axis("document", inplace=True)
        self.assertEqual(representation.tfidf(s), s_true)
Example no. 8
    def test_tfidf_max_df(self):
        s = pd.Series([["one"], ["one", "two"]])
        s_true = pd.Series([[0.0], [1.4054651081081644]])
        s_true.rename_axis("document", inplace=True)
        self.assertEqual(representation.tfidf(s, max_df=1), s_true)
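With `max_df=1`, any term whose document frequency exceeds 1 is dropped from the vocabulary (assuming the integer max_df semantics of scikit-learn): "one" appears in both documents and disappears, so only "two" is scored and document 0 keeps a zero entry. A quick check of the surviving value under the smoothed-idf formula:

import math

# "two" occurs once, in 1 of the 2 documents.
print(1 * (math.log((1 + 2) / (1 + 1)) + 1))   # 1.4054651081081644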
Example no. 9
    def test_tfidf_min_df(self):
        s = pd.Series([["one"], ["one", "two"]])
        s_true = pd.Series([[1.0], [1.0]])
        s_true.rename_axis("document", inplace=True)
        self.assertEqual(representation.tfidf(s, min_df=2), s_true)
Example no. 10
    def test_tfidf_max_features(self):
        s = pd.Series("one one two")
        s = preprocessing.tokenize(s)
        s_true = pd.Series([[2.0]])
        s_true.rename_axis("document", inplace=True)
        self.assertEqual(representation.tfidf(s, max_features=1), s_true)
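`max_features=1` keeps only the single most frequent term, here "one" with a count of 2; with a one-document corpus the smoothed idf is exactly 1, which gives the expected 2.0. A quick check, assuming the same formula as above:

import math

# "one": tf = 2, document frequency 1 in a corpus of 1 document.
print(2 * (math.log((1 + 1) / (1 + 1)) + 1))   # 2.0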
Example no. 11
    def test_tfidf_single_not_lowercase(self):
        s = pd.Series("ONE one")
        s = preprocessing.tokenize(s)
        s_true = pd.Series([[1.0, 1.0]])
        s_true.rename_axis("document", inplace=True)
        self.assertEqual(representation.tfidf(s), s_true)
Example no. 12
    def test_idf_not_tokenized_yet(self):
        s = pd.Series("a")
        s_true = pd.Series([[1]])
        self.assertEqual(representation.tfidf(s), s_true)
Example no. 13
]

test_cases_representation = [
    [
        "count",
        lambda x: representation.flatten(representation.count(x)),
        (s_tokenized_lists, ),
    ],
    [
        "term_frequency",
        lambda x: representation.flatten(representation.term_frequency(x)),
        (s_tokenized_lists, ),
    ],
    [
        "tfidf",
        lambda x: representation.flatten(representation.tfidf(x)),
        (s_tokenized_lists, ),
    ],
    ["pca", representation.pca, (s_numeric_lists, 0)],
    ["nmf", representation.nmf, (s_numeric_lists, )],
    ["tsne", representation.tsne, (s_numeric_lists, )],
    ["kmeans", representation.kmeans, (s_numeric_lists, 1)],
    ["dbscan", representation.dbscan, (s_numeric_lists, )],
    ["meanshift", representation.meanshift, (s_numeric_lists, )],
]

test_cases_visualization = []

test_cases = (test_cases_nlp + test_cases_preprocessing +
              test_cases_representation + test_cases_visualization)
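The excerpt does not show how these case tables are consumed; the real suite presumably drives them through its own parameterisation helper. A minimal sketch of one possible consumer, assuming each entry is a (name, callable, args) triple and each callable returns a pandas Series:

import unittest

import pandas as pd

class TestRepresentationCases(unittest.TestCase):
    def test_returns_series(self):
        # Run every registered representation case and make a weak
        # structural assertion on the result.
        for name, function, args in test_cases_representation:
            with self.subTest(case=name):
                self.assertIsInstance(function(*args), pd.Series)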