Ejemplo n.º 1
0
def apply_kmeans(do_pca, x_train, y_train, x_test, y_test, kmeans_max_iter, kmeans_max_k):
    """Run k-means for each k, average over repeated runs, and plot the results.

    For every k in ``range(1, kmeans_max_k)`` a fresh ``KMeans`` model is
    fitted six times. The per-iteration SSE curve, the minimum SSE and the
    purity are averaged over those runs and then plotted.

    Args:
        do_pca: Formatted with ``%d`` into the names of the saved plots.
        x_train: Training samples passed to ``KMeans.fit`` and ``get_purity``.
        y_train: Training labels passed to ``KMeans.get_purity``.
        x_test: Unused.
        y_test: Unused.
        kmeans_max_iter: Second argument given to the ``KMeans`` constructor.
        kmeans_max_k: Exclusive upper bound of the cluster counts tried.
    """
    print('kmeans\n')
    train_sses_vs_iter = []
    train_sses_vs_k = []
    train_purities_vs_k = []

    # The previous implementation iterated over range(1, 7), i.e. six runs.
    n_repeats = 6

    for k in range(1, kmeans_max_k):
        sse_curve_sum = None
        min_sse_sum = 0
        purity_sum = 0

        for _ in range(n_repeats):
            kmeans = KMeans(k, kmeans_max_iter)
            sse_vs_iter = kmeans.fit(x_train)

            # Sum the curves element by element. The earlier code added the
            # single value sse_vs_iter[k-1] to the stored curve, which does
            # not produce a per-iteration average.
            if sse_curve_sum is None:
                sse_curve_sum = list(sse_vs_iter)
            else:
                sse_curve_sum = [
                    acc + sse for acc, sse in zip(sse_curve_sum, sse_vs_iter)
                ]

            purity_sum += kmeans.get_purity(x_train, y_train)
            min_sse_sum += min(sse_vs_iter)

        avg_purity = purity_sum / n_repeats
        print("Purity: ", avg_purity)

        train_sses_vs_iter.append([total / n_repeats for total in sse_curve_sum])
        train_purities_vs_k.append(avg_purity)
        train_sses_vs_k.append(min_sse_sum / n_repeats)

    plot_y_vs_x_list(train_sses_vs_iter, x_label='iter', y_label='sse',
                     save_path='plot_sse_vs_k_subplots_%d'%do_pca)
    plot_y_vs_x(train_sses_vs_k, x_label='k', y_label='sse',
                save_path='plot_sse_vs_k_%d'%do_pca)
    plot_y_vs_x(train_purities_vs_k, x_label='k', y_label='purities',
                save_path='plot_purity_vs_k_%d'%do_pca)
Ejemplo n.º 2
0
def apply_kmeans(do_pca, x_train, y_train, kmeans_max_iter, kmeans_max_k):
    """Fit k-means once per k, report progress, and plot SSE and purity.

    Args:
        do_pca: Formatted with ``%d`` into the names of the saved plots.
        x_train: Training samples passed to ``KMeans.fit`` and ``get_purity``.
        y_train: Training labels passed to ``KMeans.get_purity``.
        kmeans_max_iter: Second argument given to the ``KMeans`` constructor.
        kmeans_max_k: Exclusive upper bound of the cluster counts tried.
    """
    print('kmeans\n')
    sse_curves, lowest_sses, purities = [], [], []

    began = time.time()
    for n_clusters in range(1, kmeans_max_k):
        elapsed_text = "\telapsed time: %.2f" % (time.time() - began)
        print("On step k =", n_clusters, "of", kmeans_max_k, elapsed_text, "s")

        model = KMeans(n_clusters, kmeans_max_iter)
        curve = model.fit(x_train)
        sse_curves.append(curve)
        purities.append(model.get_purity(x_train, y_train))
        lowest_sses.append(min(curve))

    # One curve per k: SSE as a function of the iteration.
    plot_y_vs_x_list(sse_curves, x_label='iter', y_label='sse',
                     save_path='plot_sse_vs_k_subplots_%d' % do_pca)
    # Lowest SSE reached for each k.
    plot_y_vs_x(lowest_sses, x_label='k', y_label='sse',
                save_path='plot_sse_vs_k_%d' % do_pca)
    # Purity on the training data for each k.
    plot_y_vs_x(purities, x_label='k', y_label='purities',
                save_path='plot_purity_vs_k_%d' % do_pca)
Ejemplo n.º 3
0
def apply_kmeans(do_pca, x_train, y_train, x_test, y_test, kmeans_max_iter,
                 kmeans_max_k):
    """Fit k-means once for each k and plot SSE and purity against k.

    Args:
        do_pca: Formatted with ``%d`` into the names of the saved plots.
        x_train: Training samples passed to ``KMeans.fit`` and ``get_purity``.
        y_train: Training labels passed to ``KMeans.get_purity``.
        x_test: Unused.
        y_test: Unused.
        kmeans_max_iter: Second argument given to the ``KMeans`` constructor.
        kmeans_max_k: Exclusive upper bound of the cluster counts tried.
    """
    print('kmeans\n')
    curves_per_k = []
    min_sse_per_k = []
    purity_per_k = []

    cluster_count = 1
    while cluster_count < kmeans_max_k:
        clusterer = KMeans(cluster_count, kmeans_max_iter)
        sse_history = clusterer.fit(x_train)
        curves_per_k.append(sse_history)
        purity_per_k.append(clusterer.get_purity(x_train, y_train))
        min_sse_per_k.append(min(sse_history))
        cluster_count += 1

    subplot_path = 'plot_sse_vs_k_subplots_%d' % do_pca
    plot_y_vs_x_list(curves_per_k, x_label='iter', y_label='sse',
                     save_path=subplot_path)

    sse_path = 'plot_sse_vs_k_%d' % do_pca
    plot_y_vs_x(min_sse_per_k, x_label='k', y_label='sse', save_path=sse_path)

    purity_path = 'plot_purity_vs_k_%d' % do_pca
    plot_y_vs_x(purity_per_k, x_label='k', y_label='purities',
                save_path=purity_path)
Ejemplo n.º 4
0
def apply_kmeans3(do_pca, x_train, y_train, kmeans_max_iter, kmeans_max_k):
    """Plot the purity averaged over five k-means runs for k = 1..10.

    Args:
        do_pca: Formatted with ``%d`` into the name of the saved plot.
        x_train: Training samples passed to ``KMeans.fit`` and ``get_purity``.
        y_train: Training labels passed to ``KMeans.get_purity``.
        kmeans_max_iter: Second argument given to the ``KMeans`` constructor.
        kmeans_max_k: Unused; the range of k is fixed to 1..10.
    """
    print('kmeans\n')
    train_sses_vs_iter = []
    train_sses_vs_k = []

    mean_purities = []
    for n_clusters in range(1, 11):
        print('k:', n_clusters)

        # Purities of the five runs for this k only.
        run_purities = []
        for _ in range(5):
            model = KMeans(n_clusters, kmeans_max_iter)
            curve = model.fit(x_train)
            train_sses_vs_iter.append(curve)
            run_purities.append(model.get_purity(x_train, y_train))
            train_sses_vs_k.append(min(curve))

        print(run_purities)
        mean_purities.append(sum(run_purities) / len(run_purities))

    print(mean_purities)
    print('max purity', max(mean_purities))
    plot_y_vs_x(mean_purities, x_label='k', y_label='purities',
                save_path='plot_purity_vs_k_%d' % do_pca)
Ejemplo n.º 5
0
def apply_kmeans1(do_pca, x_train, y_train, kmeans_max_iter, kmeans_max_k):
    """Plot the SSE-vs-iteration curve averaged over five runs with k = 6.

    Args:
        do_pca: Formatted with ``%d`` into the name of the saved plot.
        x_train: Training samples passed to ``KMeans.fit`` and ``get_purity``.
        y_train: Training labels passed to ``KMeans.get_purity``.
        kmeans_max_iter: Second argument given to the ``KMeans`` constructor.
        kmeans_max_k: Unused; the number of clusters is fixed to 6.
    """
    print('kmeans\n')
    train_sses_vs_iter = []
    train_sses_vs_k = []
    train_purities_vs_k = []

    n_runs = 5
    n_clusters = 6

    for _ in range(n_runs):
        kmeans = KMeans(n_clusters, kmeans_max_iter)
        sse_vs_iter = kmeans.fit(x_train)
        train_sses_vs_iter.append(sse_vs_iter)
        train_purities_vs_k.append(kmeans.get_purity(x_train, y_train))
        train_sses_vs_k.append(min(sse_vs_iter))

    # Average the runs iteration by iteration. zip(*...) walks the curves in
    # lockstep, so the builtin sum() is no longer shadowed by a local and a
    # shorter curve cannot cause an IndexError.
    averaged_curve = [
        sum(values_at_iter) / n_runs
        for values_at_iter in zip(*train_sses_vs_iter)
    ]
    # plot_y_vs_x_list is given a list of curves, so wrap the single curve.
    result = [averaged_curve]

    print(result)

    plot_y_vs_x_list(result,
                     x_label='iter',
                     y_label='sse',
                     save_path='plot_sse_vs_k_subplots_%d' % do_pca)
def apply_kmeans(do_pca, x_train, y_train, kmeans_max_iter, kmeans_max_k):
    """Run k-means five times with k = 6 and plot individual and mean SSE curves.

    Args:
        do_pca: Formatted with ``%d`` into the names of the saved plots.
        x_train: Training samples passed to ``KMeans.fit`` and ``get_purity``.
        y_train: Training labels passed to ``KMeans.get_purity``.
        kmeans_max_iter: Second argument given to the ``KMeans`` constructor.
        kmeans_max_k: Unused; the number of clusters is fixed to 6.
    """
    print('kmeans\n')
    train_sses_vs_iter = []
    train_sses_vs_k = []
    train_purities_vs_k = []

    n_runs = 5
    n_clusters = 6

    # Running element-wise sum of the SSE curves of all runs.
    sse_totals = None
    for _ in range(n_runs):
        kmeans = KMeans(n_clusters, kmeans_max_iter)
        sse_vs_iter = kmeans.fit(x_train)
        train_sses_vs_iter.append(sse_vs_iter)
        train_purities_vs_k.append(kmeans.get_purity(x_train, y_train))
        train_sses_vs_k.append(min(sse_vs_iter))
        if sse_totals is None:
            sse_totals = [0] * len(sse_vs_iter)
        sse_totals = [
            total + sse for total, sse in zip(sse_totals, sse_vs_iter)
        ]

    # Divide by the number of runs: previously the plotted "average" was
    # actually the sum over all runs.
    avg_list = [total / n_runs for total in sse_totals]

    plot_y_vs_x_list(train_sses_vs_iter,
                     x_label='iter',
                     y_label='sse',
                     save_path='plot_sse_vs_k_subplots_%d' % do_pca)
    plot_y_vs_x(avg_list,
                x_label='iterations',
                y_label='sse',
                save_path='plot_sse_vs_iter_%d' % do_pca)
Ejemplo n.º 7
0
def main(dataset_fn, output_fn, clusters_no):
    """Cluster the locations of a CSV file and save the result.

    Args:
        dataset_fn: Path of the CSV file; it must have 'LAT' and 'LON' columns.
        output_fn: Destination handed to ``model.save``.
        clusters_no: Number of clusters handed to ``KMeans``.
    """
    frame = pd.read_csv(dataset_fn)

    # One Point(latitude, longitude) per CSV row.
    geo_locs = [
        Point(float(record['LAT']), float(record['LON']))
        for _, record in frame.iterrows()
    ]

    model = KMeans(geo_locs, clusters_no)
    status = model.fit(True)
    if status == -1:
        print("No of points are less than cluster number!")
        return

    # Only reached when fitting did not report -1.
    model.save(output_fn)
 def _fit(self, X):
     """Initialise the parameters from k-means, then iterate until they settle.

     The means start at the k-means centers, every component starts with the
     covariance of ``X``, and the mixing coefficients start uniform. The
     expectation and maximization steps are then repeated until the
     flattened parameters no longer change according to ``np.allclose``.
     """
     def flattened():
         # All current parameters as a single 1-D vector, used for comparison.
         return np.hstack(
             (self.mu.ravel(), self.cov.ravel(), self.coef.ravel())
         )

     data_cov = np.cov(X.T)
     seeder = KMeans(self.n_components)
     seeder.fit(X)
     self.mu = seeder.centers
     self.cov = np.array([data_cov for _ in range(self.n_components)])
     self.coef = np.ones(self.n_components) / self.n_components

     previous = flattened()
     while True:
         stats = self._expectation(X)
         self._maximization(X, stats)
         current = flattened()
         if np.allclose(previous, current):
             break
         previous = current
Ejemplo n.º 9
0
def apply_kmeans(do_pca, x_train, y_train, kmeans_max_iter, kmeans_max_k):
    """Average five k-means runs per k and plot SSE and purity.

    Args:
        do_pca: Formatted with ``%d`` into the names of the saved plots.
        x_train: Training samples passed to ``KMeans.fit`` and ``get_purity``.
        y_train: Training labels passed to ``KMeans.get_purity``.
        kmeans_max_iter: Second argument given to the ``KMeans`` constructor.
        kmeans_max_k: Exclusive upper bound of the cluster counts tried.
    """
    print('kmeans\n')
    train_sses_vs_iter = []
    train_sses_vs_k = []
    train_purities_vs_k = []

    # Several runs per k reduce the effect of the random start.
    n_trials = 5

    for k in range(1, kmeans_max_k):
        sse_totals = None
        purity_total = 0.

        for _ in range(n_trials):
            kmeans = KMeans(k, kmeans_max_iter)
            sse = kmeans.fit(x_train)
            # Identity check: "== None" would be evaluated element-wise if
            # fit() returned an array. Copy the first curve so the object
            # returned by fit() is never modified in place.
            if sse_totals is None:
                sse_totals = list(sse)
            else:
                sse_totals = [
                    total + value for total, value in zip(sse_totals, sse)
                ]

            purity_total += kmeans.get_purity(x_train, y_train)

        avg_sses = [total / n_trials for total in sse_totals]

        train_sses_vs_iter.append(avg_sses)
        train_purities_vs_k.append(purity_total / n_trials)
        train_sses_vs_k.append(min(avg_sses))

    plot_y_vs_x_list(train_sses_vs_iter,
                     x_label='iter',
                     y_label='sse',
                     save_path='plot_sse_vs_k_subplots_%d' % do_pca)
    plot_y_vs_x(train_sses_vs_k,
                x_label='k',
                y_label='sse',
                save_path='plot_sse_vs_k_%d' % do_pca)
    plot_y_vs_x(train_purities_vs_k,
                x_label='k',
                y_label='purities',
                save_path='plot_purity_vs_k_%d' % do_pca)
Ejemplo n.º 10
0
def apply_kmeans_2(do_pca, x_train, y_train, kmeans_max_iter, kmeans_max_k):
    """Plot the minimum SSE averaged over five k-means runs for k = 1..10.

    Args:
        do_pca: Formatted with ``%d`` into the name of the saved plot.
        x_train: Training samples passed to ``KMeans.fit`` and ``get_purity``.
        y_train: Training labels passed to ``KMeans.get_purity``.
        kmeans_max_iter: Second argument given to the ``KMeans`` constructor.
        kmeans_max_k: Unused; the range of k is fixed to 1..10.
    """
    print('kmeans\n')
    train_sses_vs_iter = []
    train_sses_vs_k = []
    train_purities_vs_k = []
    avg_me = []

    n_runs = 5

    for k in range(1, 11):
        # Collected per k. Averaging the ever-growing global list, as the
        # code did before, mixed the results of all smaller k into each point.
        min_sses_for_k = []
        for _ in range(n_runs):
            kmeans = KMeans(k, kmeans_max_iter)
            sse_vs_iter = kmeans.fit(x_train)
            train_sses_vs_iter.append(sse_vs_iter)
            train_purities_vs_k.append(kmeans.get_purity(x_train, y_train))
            min_sses_for_k.append(min(sse_vs_iter))
        train_sses_vs_k.extend(min_sses_for_k)
        avg_me.append(sum(min_sses_for_k) / len(min_sses_for_k))

    plot_y_vs_x(avg_me,
                x_label='k',
                y_label='sse',
                save_path='plot_sse_vs_k_%d' % do_pca)
Ejemplo n.º 11
0
def main():
    """Cluster three synthetic 2-D blobs with k-means and plot the outcome."""
    k = 3

    # 30 random integer points in each of three separate coordinate ranges.
    blob_ranges = [(0, 20), (40, 60), (80, 100)]
    X = [
        [random.randint(low, high), random.randint(low, high)]
        for low, high in blob_ranges
        for _ in range(30)
    ]

    print(f"Cluster points:{X}")

    kmeans = KMeans(n_cluster=k, tol=3e-4)
    centroids = kmeans.fit(X)
    probes = [[0.0, 0.0], [50.0, 40.0], [100.0, 100.0]]
    prediction = kmeans.predict(probes)

    print(f"KMeans centroids: {centroids}")
    print(f"KMeans predict for [0,0],[50,40],[100,100]]: {prediction}")

    point_xs = [point[0] for point in X]
    point_ys = [point[1] for point in X]

    # NOTE(review): every pass draws ALL points in the next colour, so only
    # the last colour stays visible; per-cluster colouring was probably
    # intended - confirm against the return value of KMeans.predict.
    colors = ['r', 'g', 'b']
    for color in colors[:k]:
        plt.scatter(point_xs, point_ys, s=7, c=color)

    centroid_xs = [center[0] for center in centroids]
    centroid_ys = [center[1] for center in centroids]
    plt.scatter(centroid_xs, centroid_ys, marker='*', s=200, c='black')
    plt.show()
Ejemplo n.º 12
0
def apply_kmeans_3(do_pca, x_train, y_train, kmeans_max_iter, kmeans_max_k):
    """Plot the purity averaged over five k-means runs for k = 1..10.

    Args:
        do_pca: Formatted with ``%d`` into the name of the saved plot.
        x_train: Training samples passed to ``KMeans.fit`` and ``get_purity``.
        y_train: Training labels passed to ``KMeans.get_purity``.
        kmeans_max_iter: Second argument given to the ``KMeans`` constructor.
        kmeans_max_k: Unused; the range of k is fixed to 1..10.
    """
    train_sses_vs_iter = []
    train_sses_vs_k = []
    train_purities_vs_k = []
    averg_list = []

    n_runs = 5

    for k in range(1, 11):
        # Collected per k. Averaging the ever-growing global list, as the
        # code did before, mixed the purities of all smaller k into each point.
        purities_for_k = []
        for _ in range(n_runs):
            kmeans = KMeans(k, kmeans_max_iter)
            sse_vs_iter = kmeans.fit(x_train)
            train_sses_vs_iter.append(sse_vs_iter)
            purities_for_k.append(kmeans.get_purity(x_train, y_train))
            train_sses_vs_k.append(min(sse_vs_iter))
        train_purities_vs_k.extend(purities_for_k)
        averg_list.append(sum(purities_for_k) / len(purities_for_k))

    # Plot the average purity for each k.
    plot_y_vs_x(averg_list,
                x_label='k',
                y_label='purities',
                save_path='plot_purity_vs_k_%d' % do_pca)
Ejemplo n.º 13
0
def main(dataset_fn, output_fn, clusters_no, w):
    """Cluster the nodes of a CSV file with weights, then save and show them.

    Args:
        dataset_fn: Path of the CSV file; it must have the columns 'X', 'Y',
            'PreChange' and 'ID'.
        output_fn: Destination handed to ``model.save``.
        clusters_no: Number of clusters handed to ``KMeans``.
        w: Array-like converted with ``np.array`` and handed to ``KMeans``.
    """
    table = pd.read_csv(dataset_fn)

    # One Node([x, y, pre_change], id) per CSV row.
    geo_locs = []
    for _, record in table.iterrows():
        features = [
            float(record['X']),
            float(record['Y']),
            float(record['PreChange']),
        ]
        geo_locs.append(Node(features, record['ID']))

    weights = np.array(w)
    model = KMeans(geo_locs, clusters_no, weights)
    if model.fit(True) == -1:
        print("No of points are less than cluster number!")
        return

    # Only reached when fitting did not report -1.
    model.save(output_fn)
    model.showresult(True)
Ejemplo n.º 14
0
def apply_kmeans_avg(x_train, y_train, kmeans_max_iter, k, iterations=5):
    """Average the results of several k-means runs for one value of k.

    Args:
        x_train: Training samples passed to ``KMeans.fit`` and ``get_purity``.
        y_train: Training labels passed to ``KMeans.get_purity``.
        kmeans_max_iter: Second argument given to the ``KMeans`` constructor.
        k: Number of clusters.
        iterations: Number of independent runs that are averaged.

    Returns:
        A tuple ``(mean SSE curve as list, mean minimum SSE, mean purity)``.
    """
    curve_total = None
    min_sse_total = 0
    purity_total = 0
    print("")

    for run_no in range(1, iterations + 1):
        print("On step ", run_no, "of", iterations, "for k =", k)
        model = KMeans(k, kmeans_max_iter)
        curve = np.array(model.fit(x_train))

        # The accumulator is created lazily because the curve length is only
        # known after the first fit.
        if curve_total is None:
            curve_total = np.zeros(len(curve))
        curve_total += curve

        purity_total += model.get_purity(x_train, y_train)
        min_sse_total += curve.min()

    mean_curve = (curve_total / iterations).tolist()
    mean_min_sse = min_sse_total / iterations
    mean_purity = purity_total / iterations
    return mean_curve, mean_min_sse, mean_purity
def apply_kmeans(do_pca, x_train, y_train, kmeans_max_iter, kmeans_max_k):
    """Average five k-means runs per k and plot the mean SSE and purity.

    Args:
        do_pca: Formatted with ``%d`` into the names of the saved plots.
        x_train: Training samples passed to ``KMeans.fit`` and ``get_purity``.
        y_train: Training labels passed to ``KMeans.get_purity``.
        kmeans_max_iter: Second argument given to the ``KMeans`` constructor.
        kmeans_max_k: Exclusive upper bound of the cluster counts tried.
    """
    train_sses_vs_iter = []
    train_sses_vs_k = []
    train_purities_vs_k = []

    n_runs = 5

    for k in range(1, kmeans_max_k):
        # Start from zero for every k. Previously the accumulators lived
        # outside this loop, so a fifth of the previous k's average was
        # carried into the next one.
        sses_sum = 0
        purities_sum = 0
        for _ in range(n_runs):
            kmeans = KMeans(k, kmeans_max_iter)
            sse_vs_iter = kmeans.fit(x_train)
            sses_sum += min(sse_vs_iter)
            purities_sum += kmeans.get_purity(x_train, y_train)
        print(k)
        train_sses_vs_k.append(sses_sum / n_runs)
        train_purities_vs_k.append(purities_sum / n_runs)
    print(train_sses_vs_k)
    print(train_purities_vs_k)

    # NOTE(review): train_sses_vs_iter is never filled in this function, so
    # this call receives an empty list - confirm whether that is intended.
    plot_y_vs_x_list(train_sses_vs_iter,
                     x_label='iter',
                     y_label='sse',
                     save_path='plot_sse_vs_k_subplots_%d' % do_pca)
    plot_y_vs_x(train_sses_vs_k,
                x_label='k',
                y_label='sse',
                save_path='plot_sse_vs_k_%d' % do_pca)
    plot_y_vs_x(train_purities_vs_k,
                x_label='k',
                y_label='purities',
                save_path='plot_purity_vs_k_%d' % do_pca)
	def _derive_initial_parameters(self, x):
		"""Fit a KMeans model on ``x`` and return its means, covariances and probabilities."""
		seed_model = KMeans(k=self.k, tol=self.tol, max_iter=self.max_iter)
		seed_model.fit(x)
		means = seed_model.means
		covariances = seed_model.covariance_matrices
		probabilities = seed_model.cluster_probabilities
		return means, covariances, probabilities
Ejemplo n.º 17
0
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from clustering import KMeans

# Twelve 2-D sample points, written as one (x, y) pair per row.
data = np.array([
    [1, 1], [0, 0], [0, 1], [1, 0],
    [5, 5], [5, 7], [7, 5], [6, 6],
    [0, 9], [0, 8], [1, 8], [1, 9],
])

print(data)

# Fit three clusters and label every sample.
model = KMeans(k=3, iterations=10)
model.fit(data)
print(model.centroids_)
labels = model.label(data)

print(labels)

data = pd.DataFrame(data)
print(data)

# One scatter call per cluster index.
for ki in range(3):
    members = data[labels == ki]
    plt.scatter(members[0], members[1])

# Mark every centroid with a black cross.
for c in model.centroids_:
    plt.scatter(c[0], c[1], c='k', marker='x')

plt.show()
Ejemplo n.º 18
0
class ClusteringTest(unittest.TestCase):
    """Unit tests for the ``KMeans`` helper, run against a local SparkContext."""

    def setUp(self):
        """Seed ``random``, start a local SparkContext and build shared fixtures."""
        random.seed(1)
        self.sc = SparkContext(master='local')
        # Twelve 3-D points, written as three groups of four.
        self.points = self.sc.parallelize([
            (1, 1, 0),
            (1, 2, 2),
            (2, 2, 2),
            (2, 1, 0),

            (8, 0, 0),
            (8, 1, 2),
            (9, 1, 0),
            (9, 2, 2),

            (1, 1, 7),
            (1, 2, 9),
            (2, 2, 9),
            (2, 1, 7),
        ])
        self.k = 3
        self.kmeans = KMeans(self.k)

    def tearDown(self):
        """Stop the SparkContext created in ``setUp``."""
        self.sc.stop()

    def test_find_outer_vertices(self):
        """Check the outer vertices reported for the shared points."""
        edges = KMeans.find_outer_vertices(self.points)
        # The expected tuple equals (min, max) of each coordinate of self.points.
        self.assertEqual(edges, (1, 9, 0, 2, 0, 9))

    def test_assign_points_to_centroids(self):
        """Check that every shared point is listed under its expected centroid."""
        centroids = {
            (1.5, 1.5, 1.0): [],
            (1.5, 1.5, 8.0): [],
            (8.5, 1.0, 1.0): [],
        }
        data = self.kmeans.assign_points_to_centroids(centroids, self.points).collectAsMap()
        self.assertDictEqual(data, {
            (1.5, 1.5, 1.0): [
                (1, 1, 0),
                (1, 2, 2),
                (2, 2, 2),
                (2, 1, 0),
            ],
            (1.5, 1.5, 8.0): [
                (1, 1, 7),
                (1, 2, 9),
                (2, 2, 9),
                (2, 1, 7),
            ],
            (8.5, 1.0, 1.0): [
                (8, 0, 0),
                (8, 1, 2),
                (9, 1, 0),
                (9, 2, 2),
            ],
        })

    def test_recalculate_cluster_centroids(self):
        """Check that each cluster is re-keyed by the expected new centroid."""
        # Each entry pairs an outdated centroid key with the points of its cluster.
        centroids = [
            (
                (10, 11), [(1, 2), (3, 4)]
            ),
            (
                (12, 13), [(2, 3), (4, 5), (6, 7)]
            ),
        ]
        clusters = self.sc.parallelize(centroids).groupByKey().flatMapValues(lambda x: x)

        data = self.kmeans.recalculate_cluster_centroids(clusters).collectAsMap()
        # The expected keys are the component-wise means of each point list.
        self.assertDictEqual(data, {
            (2, 3): [
                (1, 2),
                (3, 4),
            ],
            (4, 5): [
                (2, 3),
                (4, 5),
                (6, 7),
            ],
        })

    def test_calculate_centroid(self):
        """Check the centroid of two 3-D points."""
        points = [
            (1, 2, 3),
            (3, 4, 5),
        ]
        centroids = KMeans.calculate_centroid(points)
        # (2, 3, 4) is the component-wise mean of the two points.
        self.assertEqual(centroids, (2, 3, 4))

    def test_fit(self):
        """Check that ``fit`` returns ``self.k`` clusters with the expected content."""
        points = self.sc.parallelize([
            (1, 1), (1, 3), (3, 1), (3, 3),
            (5, 6), (5, 8), (7, 6), (7, 8),
            (10, 2), (10, 4), (12, 2), (12, 4),
        ])
        centroids = self.kmeans.fit(points).collectAsMap()
        self.assertEqual(len(centroids), self.k)
        # NOTE(review): the expected coordinates lie in [0, 1] and match a
        # min-max scaling of the inputs - presumably fit() normalizes the
        # data first; confirm against the KMeans implementation.
        self.assertDictEqual(centroids, {
            (0.090909090909090912, 0.14285714285714285): [
                (0.0, 0.0),
                (0.0, 0.2857142857142857),
                (0.18181818181818182, 0.0),
                (0.18181818181818182, 0.2857142857142857),
            ],
            (0.45454545454545453, 0.85714285714285721): [
                (0.36363636363636365, 0.7142857142857143),
                (0.36363636363636365, 1.0),
                (0.5454545454545454, 0.7142857142857143),
                (0.5454545454545454, 1.0),
            ],
            (0.9090909090909092, 0.2857142857142857): [
                (0.8181818181818182, 0.14285714285714285),
                (0.8181818181818182, 0.42857142857142855),
                (1.0, 0.14285714285714285),
                (1.0, 0.42857142857142855),
            ],
        })

    def test_calculate_distance(self):
        """Check the distance between two 4-D points."""
        a = (3, 5, 8, 15)
        b = (2, 3, 4, 5)
        distance = KMeans.calculate_distance(a, b)
        # 11 == sqrt(1 + 4 + 16 + 100), i.e. consistent with Euclidean distance.
        self.assertEqual(distance, 11)

    def test_calculate_average_distance(self):
        """Check the average distance from a centre to two points."""
        centre = (1, 1)
        points = [
            (4, 1),
            (5, 4),
        ]
        # The Euclidean distances from the centre are 3 and 5; their mean is 4.
        self.assertEqual(KMeans.calculate_average_distance(centre, points), 4)

    def test_normalize_data(self):
        """Check that four-column data is rescaled to the range 0..1 per column."""
        points = self.sc.parallelize([
            (10, 2, 50, 11),
            (25, 4, 55, 12),
            (80, 9, 60, 19),
            (100, 10, 65, 21),
        ])
        # NOTE(review): the second argument equals the number of columns here;
        # presumably it is the dimensionality - confirm.
        result = self.kmeans.normalize_data(points, 4).collect()
        self.assertEqual(result, [
            (0, 0, 0, 0),
            (1/6.0, 1/4.0, 1/3.0, 1/10.0),
            (7/9.0, 7/8.0, 2/3.0, 4/5.0),
            (1, 1, 1, 1),
        ])

    def test_normalize_data__single_column(self):
        """Check the same rescaling for single-column data."""
        points = self.sc.parallelize([
            (1,),
            (2,),
            (3,),
        ])
        result = self.kmeans.normalize_data(points, 1).collect()
        self.assertEqual(result, [
            (0,),
            (0.5,),
            (1,),
        ])
Ejemplo n.º 19
0
def kmeans_area_example():
    """Benchmark k-means initialisations on the digits data and plot the clusters.

    Three ``KMeans`` configurations ('k-means++', 'random' and PCA-based
    initial centers) are fitted on the scaled digits data, and one line of
    timing and clustering metrics is printed for each. A further model is
    then fitted on a 2-component PCA projection, and its predictions over a
    mesh are shown together with the projected samples and the centroids.
    """
    from time import time
    import numpy as np
    import matplotlib.pyplot as plt

    from sklearn import metrics
    from sklearn.cluster import KMeans
    from sklearn.datasets import load_digits
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import scale

    np.random.seed(42)

    digits = load_digits()
    data = scale(digits.data)

    n_samples, n_features = data.shape
    n_digits = len(np.unique(digits.target))
    labels = digits.target

    # Passed as sample_size to metrics.silhouette_score below.
    sample_size = 300

    print("n_digits: %d, \t n_samples %d, \t n_features %d" %
          (n_digits, n_samples, n_features))

    print(79 * '_')
    # The two adjacent literals are concatenated before '%' is applied, so
    # the whole header line is the value formatted by '% 9s'.
    print(
        '% 9s' % 'init'
        '    time  inertia    h**o   compl  v-meas     ARI AMI  silhouette')

    def bench_k_means(estimator, name, data):
        """Fit ``estimator`` on ``data`` and print its time and metric scores."""
        t0 = time()
        estimator.fit(data)
        print('% 9s   %.2fs    %i   %.3f   %.3f   %.3f   %.3f   %.3f    %.3f' %
              (name, (time() - t0), estimator.inertia_,
               metrics.homogeneity_score(labels, estimator.labels_),
               metrics.completeness_score(labels, estimator.labels_),
               metrics.v_measure_score(labels, estimator.labels_),
               metrics.adjusted_rand_score(labels, estimator.labels_),
               metrics.adjusted_mutual_info_score(labels, estimator.labels_),
               metrics.silhouette_score(data,
                                        estimator.labels_,
                                        metric='euclidean',
                                        sample_size=sample_size)))

    bench_k_means(KMeans(init='k-means++', n_clusters=n_digits, n_init=10),
                  name="k-means++",
                  data=data)

    bench_k_means(KMeans(init='random', n_clusters=n_digits, n_init=10),
                  name="random",
                  data=data)

    # in this case the seeding of the centers is deterministic, hence we run the
    # kmeans algorithm only once with n_init=1
    pca = PCA(n_components=n_digits).fit(data)
    bench_k_means(KMeans(init=pca.components_, n_clusters=n_digits, n_init=1),
                  name="PCA-based",
                  data=data)
    print(79 * '_')

    reduced_data = PCA(n_components=2).fit_transform(data)
    kmeans = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
    kmeans.fit(reduced_data)

    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # Obtain labels for each point in mesh. Use last trained model.
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    # print() call instead of the Python 2 print statement, which is a
    # SyntaxError on Python 3 and inconsistent with the calls above.
    print(Z[0])
    plt.figure(1)
    plt.clf()
    plt.imshow(
        Z,
        #interpolation='nearest',
        extent=(xx.min(), xx.max(), yy.min(), yy.max()),
        #cmap=plt.cm.Paired,
        #aspect='auto', origin='lower'
    )

    plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
    # Plot the centroids as a white X
    centroids = kmeans.cluster_centers_
    plt.scatter(centroids[:, 0],
                centroids[:, 1],
                marker='x',
                s=169,
                linewidths=3,
                color='w',
                zorder=10)
    plt.title('K-means clustering on the digits dataset (PCA-reduced data)\n'
              'Centroids are marked with white cross')
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.show()
Ejemplo n.º 20
0
 def cluster_image(self, n_clusters, algorithm="km"):
     if algorithm == "km":
         kmeans = KMeans(n_clusters=n_clusters, visualise=self.visualise)
         self.labelled_image = kmeans.fit(self.image, self.segments)