Exemple #1
0
class ClusteringTest(TestCase):
    """Tests for the course ``Reader`` and the ``Kmeans`` clusterer.

    unittest runs test methods in alphabetical order, so the numeric
    prefixes in the method names fix the execution order.
    """

    def setUp(self):
        """Create a fresh reader and a ``Kmeans(3)`` instance for each test."""
        self.reader = Reader(filename)
        self.clusterer = Kmeans(3)

    def test_01_courses(self):
        """The first three entries of ``reader.courses`` match the fixture."""
        course_list = self.reader.courses
        expected_head = ['Bioinformatik', 'Informatik', 'Mathematik']
        self.assertEqual(course_list[:3], expected_head)

    def test_02_normalize(self):
        """``normalize_word`` turns "(Studienrichtung" into "studienrichtung"."""
        raw = "(Studienrichtung"
        # The result is the normalized word itself (not a list of courses).
        self.assertEqual(self.reader.normalize_word(raw), "studienrichtung")

    def test_03_vocabulary(self):
        """The first three vocabulary entries match the fixture."""
        vocabulary = self.reader.vocabulary
        expected_head = ['albanologie', 'allgemeine', 'als']
        self.assertEqual(vocabulary[:3], expected_head)

    def test_04_vectorspaced(self):
        """A course name maps to a vector as long as the vocabulary."""
        vector = self.reader.vectorspaced("Slavische Philologie")

        # One component per vocabulary entry.
        self.assertEqual(len(self.reader.vocabulary), len(vector))

        # Exactly two components are set in the expected vector; presumably
        # they correspond to the two words of the course name.
        expected = [
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0
        ]
        self.assertEqual(vector, expected)

    def test_05_distance(self):
        """The distance between [1, 2, 3] and [4, 5, 6] truncates to 5."""
        first, second = [1, 2, 3], [4, 5, 6]
        dist = self.clusterer.distance(first, second)
        # Only the integer part is compared; for a Euclidean distance the
        # exact value would be sqrt(27), roughly 5.196.
        self.assertEqual(int(dist), 5)

    def test_06_vector_mean(self):
        """``vector_mean`` of two vectors is their component-wise average."""
        mean = self.clusterer.vector_mean([[1, 2, 3], [4, 5, 6]])
        self.assertEqual(mean, [2.5, 3.5, 4.5])

    def test_07_classify(self):
        """After training, every input vector receives a classification."""
        samples = self.reader.vector_spaced_data

        # Cluster assignments differ from run to run, so only the number of
        # results is checked, not their values.
        self.clusterer.train(samples)
        assignments = list(map(self.clusterer.classify, samples))
        self.assertEqual(len(assignments), len(samples))
from kmeans import Kmeans 
import numpy as np
# Demo of the project's Kmeans class on a small 2 x 9 array. Its columns are
# used as 2-D points below (the initial means are taken from data[:, i]).
#
# Every print uses the single-argument call form so that the script parses
# and prints the same text under both Python 2 and Python 3; the original
# `print a, b` statements are a SyntaxError on Python 3.
data = np.array([[1.9, 1.5, 0.4, 0.4, 0.1, 0.2, 2.0, 0.3, 0.1],
                 [2.3, 2.5, 0.2, 1.8, 0.1, 1.8, 2.5, 1.5, 0.3]])

# First run: `codes` is the plain integer 3.
codes = 3
km = Kmeans(data, codes)
# The double space reproduces what `print 'Class labels = ', km.label` emitted
# (the print statement inserts a space between its items).
print('Class labels =  {}'.format(km.label))
print('Due to the random initialization, different (wrong) labels\nare often returned')

# Classify a single 2-D point.
x = np.array([0.25, 2.0])
print(km.classify(x))
# NOTE(review): a disabled override `km.label = lab2` was left here; `lab2`
# is not defined anywhere in this script.
km.plot()
print('Verify the answer using the graph.')

# Specify the initial cluster means.
# Columns 0, 2 and 3 of `data` are stacked and transposed into a 2 x 3 array.
codes = np.array([data[:, 0], data[:, 2], data[:, 3]]).T
km = Kmeans(data, codes)
print('Clusters =  {}'.format(km.cluster))
print('Class labels =  {}'.format(km.label))

Exemple #3
0
from kmeans import Kmeans
import numpy as np
# Demo of the project's Kmeans class on a small 2 x 9 array; each column is
# later treated as one 2-D point (see the data[:, i] selections below).
#
# The original used Python 2 print statements (`print a, b`), which do not
# parse on Python 3. Each print is now a single-argument call, which behaves
# identically on Python 2 and Python 3.
data = np.array([[1.9, 1.5, 0.4, 0.4, 0.1, 0.2, 2.0, 0.3, 0.1],
                 [2.3, 2.5, 0.2, 1.8, 0.1, 1.8, 2.5, 1.5, 0.3]])

# Run 1: pass the integer 3 as `codes`.
codes = 3
km = Kmeans(data, codes)
# Two spaces after '=' on purpose: the print statement separated its items
# with a space, and the literal already ends with one.
print('Class labels =  {}'.format(km.label))
print(
    'Due to the random initialization, different (wrong) labels\nare often returned'
)

# Classify one 2-D query point.
x = np.array([0.25, 2.0])
print(km.classify(x))
# NOTE(review): the original carried a commented-out `km.label = lab2` here;
# `lab2` is not defined in this script.
km.plot()
print('Verify the answer using the graph.')

# Run 2: specify the initial cluster means.
# Stacking columns 0, 2 and 3 of `data` and transposing gives a 2 x 3 array.
codes = np.array([data[:, 0], data[:, 2], data[:, 3]]).T
km = Kmeans(data, codes)
print('Clusters =  {}'.format(km.cluster))
print('Class labels =  {}'.format(km.label))
Exemple #4
0
from kmeans import Reader
from kmeans import Kmeans

# Script: read the course list, cluster the vectorised courses with Kmeans,
# and print each course next to the cluster it was classified into.

# Input file handed to Reader.
filename = "../data/courses.txt"

reader = Reader(filename)

# List of courses exposed by the reader.
courses = reader.courses

# All words collected from the file.
words = reader.vocabulary
print("vocabulary size:", len(words))

# Vector form of the courses; zipped with `courses` below, so the two are
# expected to be in the same order.
vectorspaced_data = reader.vector_spaced_data

clusterer = Kmeans(10)
clusterer.train(vectorspaced_data)

# Pair every course with the cluster its vector is classified into.
data = [(clusterer.classify(vec), course)
        for vec, course in zip(vectorspaced_data, courses)]

# Bound to `sorted_data` rather than `sorted`: the original rebound the
# builtin `sorted` at module level, breaking any later call to it.
# Sorting on the cluster only keeps courses within a cluster in input order.
sorted_data = sorted(data, key=lambda pair: pair[0])

for cluster, course in sorted_data:
    print(cluster, course)