コード例 #1
0
class ClusteringTest(TestCase):
    """Tests for the ``Reader`` text pre-processing and ``Kmeans`` clustering.

    ``Reader``, ``Kmeans`` and ``filename`` are defined outside this class;
    the expected values below depend on the data file ``filename`` points to.
    """

    def setUp(self):
        """Create a fresh reader and a 3-cluster k-means instance per test."""
        self.reader = Reader(filename)
        self.clusterer = Kmeans(3)

    def test_01_courses(self):
        """The first three entries of ``reader.courses`` match the data file."""
        courses = self.reader.courses  # list of course names
        self.assertEqual(courses[:3],
                         ['Bioinformatik', 'Informatik', 'Mathematik'])

    def test_02_normalize(self):
        """``normalize_word`` drops the leading "(" and lowercases the word."""
        word = "(Studienrichtung"
        normalized_word = self.reader.normalize_word(word)  # a single string
        self.assertEqual(normalized_word, "studienrichtung")

    def test_03_vocabulary(self):
        """The first three entries of ``reader.vocabulary`` match the data."""
        words = self.reader.vocabulary
        self.assertEqual(words[:3], ['albanologie', 'allgemeine', 'als'])

    def test_04_vectorspaced(self):
        """``vectorspaced`` yields one slot per vocabulary word.

        For "Slavische Philologie" exactly two slots are expected to be 1
        (presumably one per word of the title) and every other slot 0.
        """
        word_to_vectorspace = self.reader.vectorspaced("Slavische Philologie")
        vocab_size = len(self.reader.vocabulary)

        # Check the length first so a size mismatch fails with a short message
        # instead of a diff of the whole vector.
        self.assertEqual(vocab_size, len(word_to_vectorspace))

        self.assertEqual(word_to_vectorspace, [
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0
        ])

    def test_05_distance(self):
        """``distance`` returns the Euclidean distance between two vectors."""
        a = [1, 2, 3]
        b = [4, 5, 6]
        euclidean_dist = self.clusterer.distance(a, b)
        # Each coordinate differs by 3, so the distance is sqrt(3 * 3**2),
        # i.e. sqrt(27) ~= 5.196. Compare against that value directly rather
        # than truncating with int(), which would accept anything in [5, 6).
        self.assertAlmostEqual(euclidean_dist, 27 ** 0.5)

    def test_06_vector_mean(self):
        """``vector_mean`` averages the given vectors component by component."""
        vectors = [[1, 2, 3], [4, 5, 6]]
        mean = self.clusterer.vector_mean(vectors)
        self.assertEqual(mean, [2.5, 3.5, 4.5])

    def test_07_classify(self):
        """After training, ``classify`` yields one result per input vector."""
        vectorspaced_data = self.reader.vector_spaced_data

        # Cluster assignments are different on every run, so the labels
        # themselves cannot be asserted; only the number of results is checked.
        self.clusterer.train(vectorspaced_data)
        clusters = [self.clusterer.classify(vec) for vec in vectorspaced_data]
        self.assertEqual(len(clusters), len(vectorspaced_data))