class ClusteringTest(TestCase):
    """Tests for the course ``Reader`` and the ``Kmeans`` clusterer.

    The numeric prefixes in the method names keep the cases in reading
    order, since unittest runs test methods sorted by name.
    """

    def setUp(self):
        """Create a fresh ``Reader`` and ``Kmeans(3)`` before every test."""
        # ``filename`` is not defined in this class; it is expected to be
        # available at module level.
        self.reader = Reader(filename)
        self.clusterer = Kmeans(3)

    def test_01_courses(self):
        """``reader.courses`` starts with the three expected course names."""
        courses = self.reader.courses
        self.assertEqual(courses[:3],
                         ['Bioinformatik', 'Informatik', 'Mathematik'])

    def test_02_normalize(self):
        """``normalize_word`` lower-cases and drops the leading parenthesis."""
        word = "(Studienrichtung"
        normalized_word = self.reader.normalize_word(word)
        self.assertEqual(normalized_word, "studienrichtung")

    def test_03_vocabulary(self):
        """``reader.vocabulary`` starts with the three expected words."""
        words = self.reader.vocabulary
        self.assertEqual(words[:3], ['albanologie', 'allgemeine', 'als'])

    def test_04_vectorspaced(self):
        """``vectorspaced`` yields one slot per vocabulary word."""
        word_to_vectorspace = self.reader.vectorspaced("Slavische Philologie")
        vocab_size = len(self.reader.vocabulary)
        self.assertEqual(vocab_size, len(word_to_vectorspace))
        # Expected vector: 94 entries, all zero except indices 59 and 71.
        self.assertEqual(word_to_vectorspace, [
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0
        ])

    def test_05_distance(self):
        """``distance`` returns the Euclidean distance between two vectors."""
        a = [1, 2, 3]
        b = [4, 5, 6]
        euclidean_dist = self.clusterer.distance(a, b)
        # Each component differs by 3, so the distance is sqrt(3 * 3**2).
        # Comparing against that value (instead of truncating with int())
        # rejects wrong results that merely happen to fall in [5, 6).
        self.assertAlmostEqual(euclidean_dist, 27 ** 0.5, places=5)

    def test_06_vector_mean(self):
        """``vector_mean`` averages the vectors component by component."""
        vectors = [[1, 2, 3], [4, 5, 6]]
        mean = self.clusterer.vector_mean(vectors)
        self.assertEqual(mean, [2.5, 3.5, 4.5])

    def test_07_classify(self):
        """``classify`` returns one result for every training vector."""
        vectorspaced_data = self.reader.vector_spaced_data
        # The resulting clusters are always different, so only the number
        # of classifications is checked, not the assignments themselves.
        self.clusterer.train(vectorspaced_data)
        clusters = [self.clusterer.classify(vec) for vec in vectorspaced_data]
        self.assertEqual(len(clusters), len(vectorspaced_data))
# Demo script: run Kmeans on a small two-row data array, classify one extra
# point, plot the result, then run again with explicit initial cluster means.
import numpy as np

from kmeans import Kmeans


def _show(label, value):
    """Print ``label`` and ``value`` separated by one space.

    A single pre-formatted argument keeps the output identical on
    Python 2 and Python 3, where ``print`` differs between statement
    and function.
    """
    print('%s %s' % (label, value))


data = np.array([[1.9, 1.5, 0.4, 0.4, 0.1, 0.2, 2.0, 0.3, 0.1],
                 [2.3, 2.5, 0.2, 1.8, 0.1, 1.8, 2.5, 1.5, 0.3]])

# First run: ``codes`` is a plain integer.
codes = 3
km = Kmeans(data, codes)
_show('Class labels = ', km.label)
print('Due to the random initialization, different (wrong) labels\nare often returned')

# Classify a single new point and show the plot for visual verification.
x = np.array([0.25, 2.0])
print(km.classify(x))
km.plot()
print('Verify the answer using the graph.')

# Specify the initial cluster means: columns 0, 2 and 3 of ``data``.
codes = np.array([data[:, 0], data[:, 2], data[:, 3]]).T
km = Kmeans(data, codes)
_show('Clusters = ', km.cluster)
_show('Class labels = ', km.label)
# Demo script: run Kmeans on a small two-row data array, classify one extra
# point, plot the result, then run again with explicit initial cluster means.
import numpy as np

from kmeans import Kmeans


def _show(label, value):
    """Print ``label`` and ``value`` separated by one space.

    A single pre-formatted argument keeps the output identical on
    Python 2 and Python 3, where ``print`` differs between statement
    and function.
    """
    print('%s %s' % (label, value))


data = np.array([[1.9, 1.5, 0.4, 0.4, 0.1, 0.2, 2.0, 0.3, 0.1],
                 [2.3, 2.5, 0.2, 1.8, 0.1, 1.8, 2.5, 1.5, 0.3]])

# First run: ``codes`` is a plain integer.
codes = 3
km = Kmeans(data, codes)
_show('Class labels = ', km.label)
print(
    'Due to the random initialization, different (wrong) labels\nare often returned'
)

# Classify a single new point and show the plot for visual verification.
x = np.array([0.25, 2.0])
print(km.classify(x))
km.plot()
print('Verify the answer using the graph.')

# Specify the initial cluster means: columns 0, 2 and 3 of ``data``.
codes = np.array([data[:, 0], data[:, 2], data[:, 3]]).T
km = Kmeans(data, codes)
_show('Clusters = ', km.cluster)
_show('Class labels = ', km.label)
# Script: read the course list, train Kmeans on its vector-space
# representation and print every course next to the cluster it was
# classified into, ordered by cluster.
from kmeans import Reader
from kmeans import Kmeans

filename = "../data/courses.txt"
reader = Reader(filename)

# List of courses read from the file.
courses = reader.courses

# Set of all words from the file.
words = reader.vocabulary
print("vocabulary size:", len(words))

vectorspaced_data = reader.vector_spaced_data

clusterer = Kmeans(10)
clusterer.train(vectorspaced_data)

# Pair each course with the cluster its vector is classified into.
data = [(clusterer.classify(vec), course)
        for vec, course in zip(vectorspaced_data, courses)]

# Sort on the cluster only; the stable sort keeps courses within a cluster
# in their original order. The result is bound to ``by_cluster`` so the
# builtin ``sorted`` is not shadowed.
by_cluster = sorted(data, key=lambda pair: pair[0])
for cluster, course in by_cluster:
    print(cluster, course)