def cluster(self):
    method = self.options.get_cluster_method()
    if method == 'kmeans':
        self.clusters = kmeans.cluster(self.options, self.data)
    else:
        raise Exception('Unknown cluster method: ' + method)
def gmm_init(k, samples):
    """Initialize a Gaussian mixture model for the samples using the k-means algorithm.

    Note: the returned weights are raw cluster sizes and do not sum to 1
    (a normalization sketch follows this function).
    """
    centers = km.kmeans(k, samples)
    clusters = km.cluster(samples, centers)
    # params is a list of [mean, sigma, weight] triples, one per component
    shapes = np.shape(np.outer(samples[0], samples[0]))
    params = [None] * k
    for i in range(k):
        cluster, center = clusters[i], centers[i]
        num_samples = len(cluster)
        # accumulate outer products of deviations to estimate sigma
        deviation = np.zeros(shapes)
        for sample in cluster:
            diff = sample - center
            deviation += np.outer(diff, diff)
        deviation /= num_samples
        params[i] = [center, deviation, num_samples]
    return params
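# Usage sketch (not part of the original code): the docstring above notes that
# the returned weights do not sum to 1. Assuming params is the list returned by
# gmm_init, the raw cluster sizes can be normalized into mixture weights:
def normalize_weights(params):
    total = float(sum(p[2] for p in params))
    for p in params:
        p[2] /= total  # weight becomes the fraction of samples in the component
    return params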
def extract_centroids_histogram(descs, k=DEFAULT_K_CLUSTERS):
    if len(descs) == 0:
        return [], []
    print "Performing clustering on " + str(len(descs)) + " descriptors (k=" + str(k) + ")..."
    # Perform clustering to find the best grouping of the descriptors
    centroids, hist = kmeans.cluster(descs, k)
    print "Found " + str(len(centroids)) + " clusters in training descriptors"
    hist = normalize_hist(hist, k)
    return centroids, hist
def cluster(data, method, k):
    print 'clustering with method:', method, 'and k:', k
    clusters = []
    human_needed = True
    while human_needed:
        if method == 'kmeans':
            duplicates = False
            unique_labels = []
            # TODO: move all of this out into a separate function
            # 1. cluster with kmeans
            centroids = kmeans.cluster(data, k)
            # 2. ask a human to label the clusters
            for cent in centroids:
                if cent not in human_points:
                    # display the centroid
                    display_char(cent, 16, 8, 'Centroid')
                    # ask the human for a label
                    label = raw_input('Please label this centroid: ')
                    # save the point with its label
                    human_points.append(cent)
                    human_labels.append(label)
                    if label not in unique_labels:
                        unique_labels.append(label)
                    else:
                        k -= 1
                        duplicates = True
                    # centroids are unhashable, so human[cent] = label won't work
                    # TODO: how to check for duplicates? (see the sketch after this function)
            # 3. ask if there need to be more clusters
            if not duplicates:
                more = raw_input('Is ' + str(k) + ' enough clusters? (yes/no): ')
                if more == 'no':
                    k += 1
                else:
                    human_needed = False
                    clusters = kmeans.fit(data, centroids)
            else:
                print 'Removing duplicate clusters'
        else:
            # who knows what will go here? long-term goals
            print method, 'is not a valid method right now. sorry!'
            return clusters
    return clusters
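# Sketch for the "unhashable type" TODO above (an assumption, not the original
# author's fix): numpy arrays can't be dict keys, but a flattened tuple of
# their values can, which makes the duplicate-label bookkeeping a plain dict.
import numpy as np

def centroid_key(cent):
    # flatten the centroid into a hashable tuple usable as a dict key
    return tuple(np.asarray(cent).ravel())

# usage: human = {}; human[centroid_key(cent)] = label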
def processData():
    articles = getArticles()
    articlesDict = {'articles': articles}
    with open('Articles.txt', 'w') as outfile:
        json.dump(articlesDict, outfile)
    clusters = kmeans.cluster(5, articles)
    data = {'clusters': clusters}
    return data
def test_K((k, corpus_filename, algorithm)):
    try:
        X, labels = Xy(corpus_filename)
        pred_clusters, centers = cluster(X, seed=1, n_clusters=k, alg=algorithm)
        clusters_data = extract_clusters(X, pred_clusters, labels, k)
        metric = wssse(clusters_data, centers)
    except Exception as e:
        metric = e
    return k, metric
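# Usage sketch (an assumption, not from the original code): the Python 2
# tuple-unpacking signature suggests test_K is meant to be mapped over a
# worker pool, one argument tuple per K. The corpus filename and algorithm
# name below are hypothetical placeholders.
from multiprocessing import Pool

pool = Pool()
grid = [(k, 'corpus.csv', 'kmeans') for k in range(2, 11)]
metrics = pool.map(test_K, grid)  # list of (k, wssse-or-exception) pairs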
def test_vs_sklearn(self):
    """Compare results with the scikit-learn implementation."""
    data = skdatasets.load_iris().data
    num_clusters = 3
    # Use Erisoglu seeding as it is deterministic
    seeds = erisoglu.generate(data, num_clusters)
    mine = mykm.cluster(data, num_clusters, seeds)
    theirs = skcluster.KMeans(n_clusters=num_clusters, n_init=1, init=seeds)
    theirs.fit(data)
    # Assert the same centroids are found
    np.testing.assert_array_almost_equal(mine['centroids'], theirs.cluster_centers_, decimal=6)
    # Assert the SSE is calculated correctly
    self.assertAlmostEqual(mine['inertia'], theirs.inertia_, places=8)
import dataset
import kmeans
import matplotlib.pyplot as plt
import numpy as np

# Load dataset
iris_data = dataset.load_dataset('iris.csv')

# Convert class names to numeric representations
iris_data, iris_classes = dataset.to_numeric(iris_data, 'species')

# Convert dataframe strings to floats
attrs_conv = list(iris_data.axes[1][:-1])
data = dataset.from_str(iris_data, attrs_conv)

# Convert dataset to matrix representation
iris_ds = dataset.to_matrix(iris_data)
print(type(iris_ds))

# Perform k-means clustering (column 4 holds the class, so drop it)
centroids, cluster_assignments, iters, orig_centroids = kmeans.cluster(np.delete(iris_ds, 4, 1), 3)

# Output results
print('Number of iterations:', iters)
print('\nFinal centroids:\n', centroids)
print('\nCluster membership and error of first 10 instances:\n', cluster_assignments[:10])
print('\nOriginal centroids:\n', orig_centroids)
# Plot the cost function for different values of K
# to find the optimum K with the elbow method
import numpy as np
import matplotlib.pyplot as plt

from kmeans import cluster

J = []  # list to hold the costs for various K
low, high = 1, 10  # bounds on K to analyze

for K in range(low, high):
    result = cluster(K)
    cost, length = 0, 0
    for i in range(K):
        cl = np.array(result[i])
        mu = np.mean(cl, axis=0)
        cost += np.sum((cl - mu) ** 2)
        length += np.size(cl, axis=0)
    J.append(cost / length)

plt.figure()
plt.style.use("seaborn")
plt.plot(range(low, high), J, "r--")
plt.show()
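# Optional sketch (not part of the original script): a crude programmatic elbow
# estimate via the largest second difference of the cost curve J computed above.
# Eyeballing the plot, as the script intends, remains the authoritative choice.
diffs = np.diff(J, 2)                      # second differences of the cost curve
elbow_K = low + 1 + int(np.argmax(diffs))  # K at which the curve bends most sharply
print("Suggested elbow K:", elbow_K)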
def clusterFeatures(features):
    ret = []
    for feat in features:
        # quantize each feature set into 500 clusters; keep centers and weights
        centers, codes, weights = kmeans.cluster(feat, 500)
        ret.append((centers, weights))
    return ret
import kmeans
import random
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np

if __name__ == "__main__":
    k = 3
    init_flag = False
    while not init_flag:
        print("\nloading dataset")
        feature_vectors = kmeans.gen_feature_vectors("iris.data.txt")
        print("\nInitializing cluster centres")
        centers = random.sample(feature_vectors, k)
        # note that this is only an initial clustering
        clustered_data = kmeans.cluster(centers, feature_vectors, True)
        print("\ncounting cluster members for each cluster")
        new_count = kmeans.count_elements(clustered_data, k)
        print("\nverifying that the clusters are acceptable")
        init_flag = kmeans.check_count(new_count, k)
        if not init_flag:
            print("\nZero cluster is detected .. reinitializing algorithm")
        else:
            print("\nclusters are acceptable .. proceeding for optimization of cluster centers")
    converge_flag = False
    counter = 1
    while not converge_flag:
        print("\nIteration no :", counter)
        old_count = new_count
        centers = kmeans.calculate_centers(clustered_data, k)
        # (the original snippet ends mid-loop; a possible completion is sketched below)
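        # --- sketch, not original code: one possible completion of the loop
        # above. Assumptions: kmeans.cluster/count_elements behave as used
        # earlier in the script, the third argument False means "not an
        # initial clustering", and stable membership counts signal convergence.
        clustered_data = kmeans.cluster(centers, feature_vectors, False)
        new_count = kmeans.count_elements(clustered_data, k)
        converge_flag = (old_count == new_count)
        counter += 1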
# Record the results of k-means clustering
# using the optimum value of K as obtained by the elbow method
import json

from kmeans import cluster

# the user inputs K after assessing the elbow plot
K = int(input())
results = cluster(K)

filename = "results.json"
with open(filename, "w") as f:
    json.dump(results, f)  # store clustering results
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

import kmeans
from util import dataset

sns.set()

# three overlapping Gaussian blobs of points
points = np.vstack(((np.random.randn(150, 2) * 0.75 + np.array([1, 0])),
                    (np.random.randn(50, 2) * 0.25 + np.array([-0.5, 0.5])),
                    (np.random.randn(50, 2) * 0.5 + np.array([-0.5, -0.5]))))

dataset = dataset(points)
dataset.reduce(5)

plt.scatter(points[:, 0], points[:, 1])

centroids, closest = kmeans.cluster(dataset.reduced_data, 3)

# points assigned to the first cluster (boolean mask keeps the shape flat)
cluster1 = points[closest == 0]
print(cluster1)
plt.scatter(cluster1[:, 0], cluster1[:, 1], c='g')

# append each point's cluster assignment as a third column
whole = np.insert(points, 2, closest, axis=1)
print(whole)

print(centroids[:, 0])
plt.scatter(centroids[:, 0], centroids[:, 1], c='r', s=100)