def spectral(X, sigma, k, centroids):
    """
    Ng spectral clustering algorithm.
    :param X: data points
    :param sigma: parameter (kernel width)
    :param k: parameter (graph neighbourhood size)
    :param centroids: initial centroids passed to KMeans
    :return: accu, the clustering accuracy
    """
    (n, d) = X.shape
    L_sym, L = get_L(X, k, sigma)
    eig, eigvec = np.linalg.eig(L_sym)  # eigenvectors are stored column-wise
    # eig_index = np.argsort(eig)[1:d+1]
    eig_index = np.argsort(eig)[:d]  # indices of the d smallest eigenvalues
    U = eigvec[:, eig_index]
    T = np.zeros(U.shape)
    for i in range(n):
        for j in range(d):
            T[i][j] = U[i][j] / np.linalg.norm(U[i])  # row-normalize U
    Y = T
    # visual(Y, k=k, sigma=sigma, save=1)
    cluster = KMeans(2, 100, centroids)
    cluster.fit(Y)
    labels = cluster.labels
    # the first 100 samples belong to one class, the rest to the other
    if labels[0] == 0:
        n1 = 100 - sum(labels[:100])
        n2 = sum(labels[100:])
    else:
        n1 = sum(labels[:100])
        n2 = 100 - sum(labels[100:])
    accu = (n1 + n2) / n
    print('---------------------sigma=%.2f, k=%d, accu=%.4f' % (sigma, k, accu))
    return accu
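# Not from the source: a minimal sketch of what the undefined get_L() above
# might look like, assuming a k-nearest-neighbour graph with Gaussian weights
# and the symmetrically normalized Laplacian L_sym = D^{-1/2} L D^{-1/2}.
import numpy as np

def get_L(X, k, sigma):
    n = X.shape[0]
    d2 = ((X[:, None, :] - X[None, :, :]) ** 2).sum(axis=-1)  # squared distances
    W = np.exp(-d2 / (2 * sigma ** 2))  # Gaussian affinities
    np.fill_diagonal(W, 0.0)
    # keep only the k nearest neighbours of each point, symmetrized
    idx = np.argsort(d2, axis=1)[:, 1:k + 1]
    mask = np.zeros_like(W, dtype=bool)
    mask[np.repeat(np.arange(n), k), idx.ravel()] = True
    W = np.where(mask | mask.T, W, 0.0)
    D = W.sum(axis=1)
    L = np.diag(D) - W  # unnormalized Laplacian
    D_inv_sqrt = np.diag(1.0 / np.sqrt(np.maximum(D, 1e-12)))
    L_sym = D_inv_sqrt @ L @ D_inv_sqrt
    return L_sym, L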
def test_predict(self):
    test_samples = [[-3, -3], [3, 3], [-1, -1], [1, 1]]
    expected_predictions = [0, 1, 0, 1]
    k_means = KMeans(num_clusters=self.num_clusters, seed=1)
    k_means.fit(self.data)
    predictions = k_means.predict(test_samples)
    self.assertEqual(expected_predictions, predictions)
def cluster_newsgroups():
    """ Cluster newsgroup categories. """
    from kmeans import KMeans
    from similarity import simMatrix

    corpus, dictionary = build_dictionary(bigram=True)
    tfidf = TFIDF(dictionary)
    newsgroups = tfidf.vectorize(corpus)
    dictionary = tfidf.dictionary
    categories = sorted(corpus.keys())

    N = 6
    print "\n{}-Most Common Words".format(N)
    for index, category in enumerate(categories):
        nlargest = np.argpartition(newsgroups[index, :], -N)[-N:]
        nlargest = nlargest[np.argsort(newsgroups[index, nlargest])][::-1]
        print "{:>24} {}".format(category, dictionary[nlargest])
    print

    K = 3
    km = KMeans(n_clusters=K)
    km.fit(newsgroups)
    labels = km.labels_

    print "\nKMeans Label Assignment, K = {}".format(K)
    for category, label in zip(categories, labels):
        print int(label), category

    simMatrix(newsgroups).plot().show()
def _initialise_prams(self, X):
    # Get initial clusters using KMeans
    kmeans = KMeans(k=self.k, max_iters=500)
    kmeans.fit(X)
    kmeans_preds = kmeans.predict(X)
    N, col_length = X.shape
    mixture_labels = np.unique(kmeans_preds)
    initial_mean = np.zeros((self.k, col_length))
    initial_cov = np.zeros((self.k, col_length, col_length))
    initial_pi = np.zeros(self.k)
    for index, mixture_label in enumerate(mixture_labels):
        mixture_indices = (kmeans_preds == mixture_label)
        Nk = X[mixture_indices].shape[0]
        # Initial pi
        initial_pi[index] = Nk / N
        # Initial mean
        initial_mean[index, :] = np.mean(X[mixture_indices], axis=0)
        # Initial covariance: plain within-cluster scatter. (The original
        # scaled this by initial_pi[index], which wrongly shrinks each
        # covariance by its mixing weight.)
        de_meaned = X[mixture_indices] - initial_mean[index, :]
        initial_cov[index] = np.dot(de_meaned.T, de_meaned) / Nk
    # compare floats with a tolerance rather than exact equality
    assert np.isclose(np.sum(initial_pi), 1)
    return initial_pi, initial_mean, initial_cov
def __init__(self,
             n_cluster: int,
             data: np.ndarray,
             use_kmeans: bool = False,
             w: float = 0.9,
             c1: float = 0.5,
             c2: float = 0.3,
             flag: int = 1,
             weights: list = None):
    index = np.random.choice(list(range(len(data))), n_cluster)
    self.centroids = data[index].copy()
    if use_kmeans:
        kmeans = KMeans(n_cluster=n_cluster, init_pp=False)
        kmeans.fit(data)
        self.centroids = kmeans.centroid.copy()
    self.best_position = self.centroids.copy()
    self.best_score = quantization_error(self.centroids, self._predict(data), data)
    self.flag = flag
    if self.flag % 2 == 1:
        self.best_sse = calc_sse(self.centroids, self._predict(data), data)
    else:
        self.best_sse = calc_sse2(self.centroids, self._predict(data), data, weights)
    self.velocity = np.zeros_like(self.centroids)
    self._w = w
    self._c1 = c1
    self._c2 = c2
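# Not from the source: hedged sketches of the undefined fitness helpers used
# by the particle above. calc_sse is assumed to be the plain within-cluster
# SSE; quantization_error is assumed to be the mean point-to-centroid
# distance averaged over non-empty clusters (the usual PSO-clustering
# objective).
import numpy as np

def calc_sse(centroids, labels, data):
    # sum of squared distances from each point to its assigned centroid
    return float(((data - centroids[labels]) ** 2).sum())

def quantization_error(centroids, labels, data):
    per_cluster = []
    for j in range(len(centroids)):
        members = data[labels == j]
        if len(members) > 0:
            per_cluster.append(np.linalg.norm(members - centroids[j], axis=1).mean())
    return float(np.mean(per_cluster))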
def test_whole(self):
    """ Tests the score method. """
    X, y, centers = generate_cluster_samples()
    n_samples = X.shape[0]
    n_features = X.shape[1]
    k = centers.shape[0]

    # run N_TRIALS, pick best model
    best_model = None
    for i in range(N_TRIALS):
        kmeans = KMeans(k, N_ITER)
        kmeans.fit(X)
        if best_model is None:
            best_model = kmeans
        elif kmeans.score(X) < best_model.score(X):
            best_model = kmeans

    # check sum squared errors
    sum_squared_errors = best_model.score(X)
    self.assertLess(sum_squared_errors / n_samples, EPS)

    # compare centers to expected centers
    smallest_distances = find_smallest_distances(
        best_model.cluster_centers, centers)
    for distance in smallest_distances:
        self.assertLess(distance, EPS)
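# Not from the source: a plausible stand-in for the find_smallest_distances
# helper the test relies on, returning, for each expected center, the
# distance to the nearest learned center.
import numpy as np

def find_smallest_distances(learned_centers, expected_centers):
    dists = np.linalg.norm(
        np.asarray(expected_centers)[:, None, :] - np.asarray(learned_centers)[None, :, :],
        axis=2)
    return dists.min(axis=1)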
def test_fit_with_different_initial_centroids(self):
    expected_labels = [0, 0, 0, 1, 1, 1]
    expected_centroids = [[-1.6666667, -1.6666667], [1.6666667, 1.6666667]]
    k_means = KMeans(num_clusters=self.num_clusters, seed=0)
    k_means.fit(self.data)
    self.assertEqual(expected_labels, k_means.labels_)
    np.testing.assert_almost_equal(expected_centroids, k_means.centroids_)
def B1(pca=False):
    ''' Plot WC_SSD and SC over K. '''
    K = [2, 4, 6, 8, 16, 32]
    fnames = [
        'digits-embedding.csv',
        'digits-embedding-2467.csv',
        'digits-embedding-67.csv'
    ]
    wc_ssd_val = zeros((len(fnames), len(K)))
    sc_val = zeros((len(fnames), len(K)))
    for i, fname in enumerate(fnames):
        X = genfromtxt(fname, delimiter=',')[:, 2:]
        for j, k in enumerate(K):
            kmeans = KMeans(n_clusters=k)
            kmeans.fit(X)
            wc_ssd_val[i, j], sc_val[i, j], _ = kmeans.get_evals()
    # Plot WC_SSD
    figure()
    for i, fname in enumerate(fnames):
        plot(K, wc_ssd_val[i], label=fname)
    legend()
    title('WC_SSD vs. K')
    # Plot SC
    figure()
    for i, fname in enumerate(fnames):
        plot(K, sc_val[i], label=fname)
    legend()
    title('SC vs. K')
    show()
def main2():
    df = pd.read_csv('credit_card_data.csv')
    df = df.fillna(df.median())
    original_data = df.iloc[:, 1:].values
    data = copy.deepcopy(original_data)
    columns = list(df.columns)[1:]  # list of column names
    print(columns)
    # min_max_data(df, columns)
    normalizacija(data)  # normalize the loaded data

    pca = PCA()
    pca.fit(data)
    # decide how far to reduce the dimensionality
    plt.plot(range(1, 18), pca.explained_variance_ratio_.cumsum(), marker='x', linestyle='--')
    plt.xlabel('Components')  # features
    plt.ylabel('Variance')
    plt.show()

    components = 7  # read off the plot above
    pca = PCA(n_components=components)
    pca.fit(data)
    scores = pca.transform(data)
    # print(scores)  # one column per retained component

    # show that the first two components explain most of the variance
    plt.bar(range(pca.n_components_), pca.explained_variance_ratio_, color='black')
    plt.xlabel('PCA components')
    plt.ylabel('Variance %')  # share of the variance each component explains
    plt.xticks(range(pca.n_components_))
    plt.show()

    # the elbow plot gives optimal k = 5 for the first 500 rows,
    # and k = 6 for the full dataset
    # optimal_k_plot(data)
    broj_klastera = 6
    k_means = MyKMeans(n_clusters=broj_klastera, max_iter=100)
    k_means.fit(scores, normalize=False)
    klaster_indeksi = k_means.klaster_indeksi
    print(klaster_indeksi)

    # group the original (unreduced) rows by their assigned cluster
    lista_klastera_sa_originalnim_podacima = []
    for i in range(broj_klastera):
        lista_klastera_sa_originalnim_podacima.append([])
    for i in range(len(original_data)):
        lista_klastera_sa_originalnim_podacima[klaster_indeksi[i]].append(original_data[i])

    # print cluster characteristics and the decision tree
    print_descriptions(lista_klastera_sa_originalnim_podacima, columns)
    # print_decision_tree(original_data, klaster_indeksi, columns)
    print_clusters_description()

    # plot the points
    plot_2_D(k_means)
def test_select_initial_centroids(self):
    expected_initial_centroids = [[2, 1], [-1, -2]]
    k_means = KMeans(num_clusters=self.num_clusters, seed=3)
    k_means.fit(self.data)
    initial_centroids = k_means._select_initial_centroids(self.data)
    self.assertEqual(expected_initial_centroids, initial_centroids)
    self.assertEqual(self.num_clusters, len(initial_centroids))
def __init_parameters(self):
    N = self.X.shape[0]
    n_features = self.X.shape[1]
    kmeans = KMeans(n_clusters=self.n_components, n_init=5)
    kmeans.fit(self.X)
    # mu, means for each component
    self.means_ = kmeans.cluster_centers_
    # sigma, covariances for each component
    self.covariances_ = np.zeros(
        [self.n_components, n_features, n_features])
    # pi, weights for each component
    self.weights_ = np.zeros(self.n_components)
    for k in range(self.n_components):
        logic = (kmeans.labels_ == k)
        Nk = logic.sum()
        # np.cov needs at least two samples, otherwise it errors
        if Nk > 1:
            Xk = self.X[logic]
            self.covariances_[k] = np.cov(Xk.T)
            self.weights_[k] = Nk / N
    # gamma(Znk)
    self.gamma = np.zeros([N, self.n_components])
    # log_likelihood
    self.lower_bound_ = -np.inf
    return self
def main():
    filepath = "./data/self_test.csv"
    #filepath = "./data/self_test_petit.csv"
    #filepath = "./data/iris.csv"

    # load the dataset
    data, labels = load_dataset(filepath)

    # initialize the KMeans object
    kmeans = KMeans(n_clusters=3, max_iter=100, early_stopping=True, tol=1e-6, display=True)

    # compute the clusters
    kmeans.fit(data)

    # compute the purity of our clusters
    score = kmeans.score(data, labels)
    print("Purity: {}".format(score))

    input("Press any key to exit...")
def test06_fit_two_clusters(self):
    np.random.seed(1)
    model = KMeans(k=2, init=init.forgy_initialization)
    data = np.array([[-1.0, 0.0], [-1.001, 0.0], [-0.999, 0.0],
                     [0.0, 1.0], [0.0, 0.999], [0.0, 1.001]])
    model.fit(data)
    self.assertEqual(model.predict(data), [1, 1, 1, 0, 0, 0])
def plot_elbow(interval, data, random_seed=None):
    inertia = []
    for n_clusters in interval:
        clf = KMeans(k=n_clusters, init='kmeans++', random_seed=random_seed)
        clf.fit(data)
        inertia.append(clf.inertia)
    plot_metrics(interval, inertia, 'Elbow method',
                 'Number of clusters (K)', 'Sum of Squared Error')
def test_fit():
    test_model = KMeans(n_clust=2, random_seed=100)
    test_x_train = np.array([[1, 2], [1, 4.1], [1, 0],
                             [10, 2.1], [10, 4.1], [10, 0]])
    test_model.fit(test_x_train)
    expected_classes = np.array([0, 0, 0, 1, 1, 1])
    expected_centers = np.array([[1., 2.0333333], [10., 2.0666667]])
    np.testing.assert_array_equal(test_model.classes, expected_classes)
    np.testing.assert_allclose(test_model.centers, expected_centers)
def test_kmeans():
    X = np.random.normal(size=(50, 2))
    km = KMeans(nr_clusters=2)
    km.fit(X)
    assert km.centroids.shape[0] == 2
    distances = []
    for centroid in km.centroids:
        distances.append(km.euclidean_distance_2d(centroid, X[-1:]))
    assert km.predict(X[-1:])[0] == np.argmin(distances)
def test_fit(self):
    expected_labels = [0, 0, 0, 1, 1, 1]
    expected_centroids = [[-1.6666667, -1.6666667], [1.6666667, 1.6666667]]
    expected_inertia = 2.6666667
    k_means = KMeans(num_clusters=self.num_clusters, seed=1)
    k_means.fit(self.data)
    self.assertEqual(expected_labels, k_means.labels_)
    np.testing.assert_almost_equal(expected_centroids, k_means.centroids_)
    self.assertAlmostEqual(expected_inertia, k_means.inertia_)
def initialize(self, data):
    """
    :param data: data, numpy 2-D array
    """
    clf = KMeans(self.K)
    clf.fit(data, 10)
    self.centers = clf.get_centers()
    self.weights = np.ones(self.K) / self.K
    self.covariances = np.array(
        [1e10 * np.eye(data.shape[1]) for _ in range(self.K)])
def main():
    (X_train, y_train), (X_test, y_test) = tf.contrib.keras.datasets.mnist.load_data()
    X_train = X_train.reshape(-1, 28 * 28)
    X_test = X_test.reshape(-1, 28 * 28)
    Y_train = tf.contrib.keras.utils.to_categorical(y_train)
    print("Data Loaded")

    model = KMeans(k=25, n_features=28 * 28, n_classes=10)
    model.fit(X_train, Y_train)
    print("final testing accuracy: %.4f" % (model.predict(X_test) == y_test).mean())
def test_manhattan_distance(self):
    expected_labels = [0, 0, 0, 1, 1, 1]
    expected_centroids = [[-2, -2], [2, 2]]
    expected_inertia = 4
    k_means = KMeans(num_clusters=self.num_clusters,
                     distance_function='manhattan', seed=1)
    k_means.fit(self.data)
    self.assertEqual(expected_labels, k_means.labels_)
    np.testing.assert_almost_equal(expected_centroids, k_means.centroids_)
    self.assertAlmostEqual(expected_inertia, k_means.inertia_)
def test_gaussian_mixture(self):
    pos_list, ground_truth = datasets.make_blobs(
        n_samples=100, centers=[[3, 3], [-3, -3], [3, -3], [-3, 3]],
        cluster_std=1, random_state=0)
    kmeans = KMeans(4)
    standard_kmeans = cluster.KMeans(4, random_state=0)
    np.random.seed(2020)
    kmeans.fit(pos_list)
    standard_kmeans.fit(pos_list)
    self.assertAlmostEqual(metrics.adjusted_rand_score(kmeans.labels_, ground_truth), 1.0)
    self.assertAlmostEqual(kmeans.inertia_, standard_kmeans.inertia_)
def test_implementation(self):
    x = np.array([[0, 0], [0, 1], [4, 0], [4, 1]])
    kmeans = KMeans(2)
    np.random.seed(2020)
    kmeans.fit(x)
    self.assertAlmostEqual(kmeans.inertia_, 1.0)
    self.assertAlmostEqual(metrics.adjusted_rand_score(kmeans.labels_, [0, 0, 1, 1]), 1.0)
    if np.abs(kmeans.cluster_centers_[0, 0] - 4) < 1e-5:
        assert_array_almost_equal(kmeans.cluster_centers_, np.array([[4, 0.5], [0, 0.5]]))
    else:
        assert_array_almost_equal(kmeans.cluster_centers_, np.array([[0, 0.5], [4, 0.5]]))
def test_fit_predict(self):
    data = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    model = KMeans(2)
    model.fit(data)
    labels = model.predict(data)
    self.assertEqual(len(labels), len(data))
    self.assertGreaterEqual(2, len(np.unique(labels)))
    for label in labels:
        self.assertTrue(
            isinstance(label, np.int64)
            or isinstance(label, np.int32)
            or isinstance(label, int))
def test05_fit_one_cluster(self):
    model = KMeans(k=1, init=init.forgy_initialization)
    data = np.array([[0.0, 0.0]])
    model.fit(data)
    self.assertEqual(model.predict(data), [0])
    np_test.assert_array_equal(model.centroids, np.array([[0.0, 0.0]]))
    test_points = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])
    self.assertEqual(model.predict(test_points), [0] * 3)
def plot_optimal_k(data):
    plt.figure()
    sum_squared_errors = []
    for n_clusters in range(1, 20):
        k_means = KMeans(n_clusters=n_clusters, max_iter=100)
        k_means.fit(data)
        sse = k_means.sum_squared_error()
        sum_squared_errors.append(sse)
    print(sum_squared_errors)
    plt.plot(range(1, 20), sum_squared_errors)
    plt.xlabel('# of clusters')
    plt.ylabel('WCSSE')
    plt.show()
def initialize(self, data: np.ndarray):
    """
    Initializes cluster centers, weights, and covariances
    :param data: data, numpy 2-D array
    """
    km = KMeans(self.K)
    km.fit(data)
    _ = km.predict(data)
    self.centers = km.get_centers()
    self.weights = np.random.uniform(0, 1, (self.K,))
    self.weights = self.weights / np.sum(self.weights)
    self.covariances = np.array([np.eye(data.shape[-1])] * self.K) * 10e8
def train(self, x_train):
    """Receive the input training data, then learn the model.

    Parameters
    ----------
    x_train: np.array, shape (num_samples, num_features)

    Returns
    -------
    None
    """
    self.affinity_matrix_ = self._get_affinity_matrix(x_train)
    embedding_features = self._get_embedding()
    kmeans = KMeans(n_clusters=self.n_clusters)
    kmeans.fit(embedding_features)
    self.labels_ = kmeans.labels_
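# Not from the source: a minimal sketch of the spectral-embedding step that
# _get_embedding() presumably performs, assuming the standard recipe of
# taking the eigenvectors of the normalized Laplacian for the n_clusters
# smallest eigenvalues (the function and variable names here are
# hypothetical).
import numpy as np

def get_embedding(affinity, n_clusters):
    d_inv_sqrt = 1.0 / np.sqrt(np.maximum(affinity.sum(axis=1), 1e-12))
    L_sym = np.eye(affinity.shape[0]) - d_inv_sqrt[:, None] * affinity * d_inv_sqrt[None, :]
    eigvals, eigvecs = np.linalg.eigh(L_sym)  # eigenvalues in ascending order
    return eigvecs[:, :n_clusters]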
def main(args):
    data = load_data(Path(args.data_csv))
    plt.clf()
    plt.scatter(data[:, 0], data[:, 1], s=3, color='blue')
    if args.algorithm == 'gmm':
        gmm = GaussianMixtureModel(args.num_clusters)
        gmm.fit(data)
        y = gmm.predict_cluster(data)
    else:
        km = KMeans(args.num_clusters)
        km.fit(data)
        y = km.predict(data)
    plt.scatter(data[:, 0], data[:, 1], c=y)
    plt.show()
def main():
    ds = Dataset(config)
    imgs, annots = ds.open_traffic_ds(config)
    dp = DataPrepper(x_data=imgs, y_data=annots)
    dp.x_data_scaled, dp.y_data_scaled = dp.rescale_data(dp.x_data, dp.y_data)
    km = KMeans(k=args.k, dataset=dp.y_data_scaled)
    if args.fit_avg:
        km.fit_average(max_iterations=args.kmeans_iters)
        if args.save_anchors:
            km.write_anchors(km.centroids)
    else:
        km.fit()
        if args.save_anchors:
            km.write_anchors(km.centroids)
def kmeans_toy():
    x, y = toy_dataset(4)
    fig = Figure()
    fig.ax.scatter(x[:, 0], x[:, 1], c=y)
    fig.savefig('plots/toy_dataset_real_labels.png')

    fig.ax.scatter(x[:, 0], x[:, 1])
    fig.savefig('plots/toy_dataset.png')

    n_cluster = 4
    k_means = KMeans(n_cluster=n_cluster, max_iter=100, e=1e-8)
    centroids, membership, i = k_means.fit(x)

    assert centroids.shape == (n_cluster, 2), \
        ('centroids for toy dataset should be numpy array of size {} X 2'
         .format(n_cluster))
    assert membership.shape == (50 * n_cluster,), \
        'membership for toy dataset should be a vector of size 200'
    assert type(i) == int and i > 0, \
        'Number of updates for toy datasets should be integer and positive'
    print('[success] : kmeans clustering done on toy dataset')
    print('Toy dataset K means clustering converged in {} steps'.format(i))

    fig = Figure()
    fig.ax.scatter(x[:, 0], x[:, 1], c=membership)
    fig.ax.scatter(centroids[:, 0], centroids[:, 1], c='red')
    fig.savefig('plots/toy_dataset_predicted_labels.png')

    np.savez('results/k_means_toy.npz',
             centroids=centroids,
             step=i,
             membership=membership,
             y=y)
def main(args):
    df = pd.read_csv(args.data_csv)
    data = np.array(df[['X', 'Y']])
    plt.clf()
    plt.scatter(data[:, 0], data[:, 1], s=3, color='blue')
    if args.algorithm == 'gmm':
        gmm = GaussianMixtureModel(args.num_clusters)
        gmm.fit(data)
        y = gmm.predict_cluster(data)
    else:
        km = KMeans(args.num_clusters)
        km.fit(data)
        y = km.predict(data)
    plt.scatter(data[:, 0], data[:, 1], c=y)
    plt.show()
def calculate_em(X, n_clusters, diag=False, ridge=1e-10,
                 verbose=False, max_iterations=100):
    """
    Returns mu, sigma and tpi
    """
    n_samples, n_features = X.shape
    # Initialise the data using kmeans
    k_means = KMeans(k=n_clusters)
    k_means_labels, _ = k_means.fit(X.copy())
    k_means_cluster_centers = k_means.centers_

    # OK, so we've got the centers and the labels. Let's now compute the EM
    # algorithm
    tau = np.zeros((n_samples, n_clusters))
    mu = np.zeros((n_clusters, n_features))
    sigma = np.zeros((n_clusters, n_features, n_features))
    p = np.zeros((n_clusters, n_samples))

    # FIXME should be able to do the following using pure matrix arithmetic
    for i, element in enumerate(k_means_labels):
        tau[i, element] = 1

    for j in range(max_iterations):
        old_mu = mu.copy()
        for i in range(n_clusters):
            mu[i] = (tau[:, i].reshape((tau.shape[0], 1)) * X).sum(axis=0) / (tau[:, i]).sum()
        for i in range(n_clusters):
            a = 0
            for n in range(n_samples):
                # (the original hard-coded 2 here; use n_features instead)
                b = (X[n, :] - mu[i]).reshape((n_features, 1))
                if diag:
                    a += tau[n, i] * np.dot(b.T, b)
                else:
                    a += tau[n, i] * np.dot(b, b.T)
            if diag:
                sigma[i, :] = a.mean() / tau[:, i].sum() * np.identity(mu.shape[1])
            else:
                sigma[i, :] = a / tau[:, i].sum()
        # mixing proportions: sum the responsibilities over samples
        # (the original summed over axis=1, i.e. over clusters)
        tpi = tau.sum(axis=0) / n_samples
        for i in range(n_clusters):
            p[i, :] = _calculate_normal(X, mu[i, :], sigma[i, :])
        for i in range(n_clusters):
            tau.T[i, :] = tpi[i] * p[i, :] / (tpi[:, None] * p).sum(axis=0)
        if ((old_mu - mu) ** 2).sum() < ridge:
            if verbose:
                print "break at iteration %d" % j
            break
    return mu, sigma, tpi
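# Not from the source: a hypothetical implementation of the _calculate_normal
# helper assumed above, evaluating the multivariate Gaussian density at every
# row of X.
import numpy as np

def calculate_normal(X, mu, sigma):
    d = X.shape[1]
    diff = X - mu
    inv = np.linalg.inv(sigma)
    norm_const = 1.0 / np.sqrt(((2 * np.pi) ** d) * np.linalg.det(sigma))
    # row-wise quadratic form diff_n^T Sigma^{-1} diff_n
    mahalanobis = np.einsum('nj,jk,nk->n', diff, inv, diff)
    return norm_const * np.exp(-0.5 * mahalanobis)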
iris_data = iris_data.data[:, 1:3]  # take the second and third features (sepal width and petal length)

plt.figure()
for i in range(len(iris_data)):
    plt.scatter(iris_data[i, 0], iris_data[i, 1])

plt.xlabel('Sepal width')
plt.ylabel('Petal length')
plt.show()

# --- INITIALIZING AND APPLYING THE K-MEANS ALGORITHM --- #
# TODO 2: K-means on the Iris dataset
kmeans = KMeans(n_clusters=2, max_iter=100)
kmeans.fit(iris_data)

colors = {0: 'red', 1: 'green'}
plt.figure()
for idx, cluster in enumerate(kmeans.clusters):
    plt.scatter(cluster.center[0], cluster.center[1], c=colors[idx], marker='x', s=200)  # plot the centers
    for datum in cluster.data:  # plot the points
        plt.scatter(datum[0], datum[1], c=colors[idx])

plt.xlabel('Sepal width')
plt.ylabel('Petal length')
plt.show()

# --- DETERMINING THE OPTIMAL K --- #
X = load_data('EMGaussienne.data')
X_test = load_data('EMGaussienne.data')

n_clusters = 4
num_init = 3

##############################################################################
# Plot result
fig = pl.figure()
colors = ['#4EACC5', '#FF9C34', '#4E9A06', '#00465F']

for ini in range(num_init):
    km = KMeans(k=n_clusters)
    k_means_labels, k_means_inertia = km.fit(X)
    k_means_cluster_centers = km.centers_
    k_means_labels_test, k_means_inertia_test = km.predict(X_test)

    # KMeans
    ax = fig.add_subplot(num_init, 2, 2 * ini + 1)
    for k, col in zip(range(n_clusters), colors):
        my_members = k_means_labels == k
        cluster_center = k_means_cluster_centers[k]
        ax.plot(X[my_members, 0], X[my_members, 1], 'w',
                markerfacecolor=col, marker='.')
        ax.plot(cluster_center[0], cluster_center[1], 'o',
                markerfacecolor=col, markeredgecolor='k', markersize=6)
    ax.set_title('KMeans - inertia %d' % k_means_inertia)
    r2, theta2 = np.random.normal(5, 0.25), np.random.uniform(0, 2 * np.pi)
    x2, y2 = r2 * np.cos(theta2), r2 * np.sin(theta2)
    s2[i] = (x2, y2, r2, theta2)
    plt.scatter(x1, y1)
    plt.scatter(x2, y2)
    data.append((x1, y1))
    data.append((x2, y2))

plt.show()

# TODO 5: K-means on this data
kmeans = KMeans(n_clusters=2, max_iter=100)
kmeans.fit(data)

colors = {0: 'red', 1: 'green'}
plt.figure()
for idx, cluster in enumerate(kmeans.clusters):
    plt.scatter(cluster.center[0], cluster.center[1], c=colors[idx], marker='x', s=200)  # plot the centers
    for datum in cluster.data:  # plot the points
        plt.scatter(datum[0], datum[1], c=colors[idx])

plt.show()

# TODO 7: DBSCAN on this data
dbscan = DBScan(epsilon=1.2, min_points=3)
dbscan.fit(data)

colors = {0: 'red', 1: 'pink', 2: 'yellow', 3: 'cyan', 4: 'green', 5: 'blue'}
class ClsfCRTL(object):
    def __init__(self, n_clusters, initCent, max_iter):
        self.data = np.array([])
        self.belongApp = np.array([])
        self.n_clusters = n_clusters
        self.clf = KMeans(n_clusters, initCent, max_iter)

    def genDataset(self, file_name):
        dataSet, belongApp = [], []
        f = open(file_name, "r")
        lines = f.readlines()
        for line in lines:
            line_elm = line.split("\t")
            dataSet.append([int(line_elm[0]), 0])
            belongApp.append(line_elm[1].rstrip("\n"))
        self.data = np.array(dataSet)
        self.belongApp = np.array(belongApp)
        f.close()

    def clsf(self):
        self.clf.fit(self.data)

    def show(self):
        cents = self.clf.centroids
        labels = self.clf.labels
        sse = self.clf.sse
        colors = ['b', 'g', 'r', 'k', 'c', 'm', 'y', '#e24fff', '#524C90',
                  '#845868', '#00FF00', '#330000', '#333300', '#333333',
                  '#CC0099', '#FFFF00', '#FF99CC', '#CCCC66', '#003333',
                  '#66FFFF']
        for i in range(self.n_clusters):
            index = np.nonzero(labels == i)[0]
            x0 = self.data[index, 0]
            x1 = self.data[index, 1]
            y_i = self.belongApp[index]
            for j in range(len(x0)):
                plt.scatter(x0[j], x1[j], marker='o', color=colors[i])
                # plt.text(x0[j], x1[j], str(y_i[j]), color=colors[i],
                #          fontdict={'weight': 'bold', 'size': 9})
            plt.scatter(cents[i, 0], cents[i, 1], marker='x',
                        color=colors[i], linewidths=5)
        plt.title("SSE={:.2f}".format(sse))
        plt.axis([0, 1600, -2, 2])
        plt.show()

    def showBar(self):
        n = 1600
        X = np.arange(n)
        Y1 = (1 - X / float(n) * np.random.uniform(0.5, 1.0, n))
        rect = plt.bar(X, +Y1, facecolor='#524C90', edgecolor='white')
        for x, y in zip(X, Y1):
            plt.text(x + 0.4, y + 0.05, '%.2f' % y, ha='center', va='bottom')
        plt.xlim(-0.5, 12.5)
        plt.ylim(-0.1, +1.25)
        plt.xlabel("xlabel")
        plt.ylabel("ylabel")
        plt.title("title")
        plt.legend((rect,), ("example",))
        plt.show()

    def genResFile(self, i):
        cents = self.clf.centroids
        sse = self.clf.sse
        f = open("Res" + "-" + str(i), "w")
        f.write(str(cents.shape[0]) + '\n')
        for cent in cents:
            f.write(str(cent[0]) + '\t' + str(cent[1]) + '\n')
        f.write(str(sse) + '\n')
        # test
        f.write("\n")
        for clu in self.clf.clusterAssment:
            f.write(str(clu[0]) + '\t' + str(clu[1]) + '\n')
        # test
        f.close()
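# Not from the source: a hedged usage sketch for ClsfCRTL. The input file
# name and the initial-centroid array are hypothetical; based on the class
# above (and the script elsewhere that passes X[50:60]), KMeans is assumed
# to accept an array of initial centroids as its second argument, and
# genDataset expects tab-separated "size<TAB>app_name" lines.
import numpy as np

if __name__ == '__main__':
    init_cents = np.array([[200.0, 0.0], [600.0, 0.0], [1000.0, 0.0], [1400.0, 0.0]])
    ctrl = ClsfCRTL(n_clusters=4, initCent=init_cents, max_iter=100)
    ctrl.genDataset('flows.tsv')  # hypothetical input file
    ctrl.clsf()
    ctrl.show()
    ctrl.genResFile(0)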
iris_data = iris_data.data[:, 1:3]  # take the second and third features (sepal width and petal length)

plt.figure()
for i in range(len(iris_data)):
    plt.scatter(iris_data[i, 0], iris_data[i, 1])

plt.xlabel('Sepal width')
plt.ylabel('Petal length')
plt.show()

# --- INITIALIZING AND APPLYING THE K-MEANS ALGORITHM --- #
# TODO 2: K-means on the Iris dataset
kmeans = KMeans(n_clusters=2, max_iter=100)
kmeans.fit(iris_data, normalize=True)

colors = {0: 'red', 1: 'green'}
plt.figure()
for idx, cluster in enumerate(kmeans.clusters):
    plt.scatter(cluster.center[0], cluster.center[1], c=colors[idx], marker='x', s=200)  # plot the centers
    for datum in cluster.data:  # plot the points
        plt.scatter(datum[0], datum[1], c=colors[idx])

plt.xlabel('Sepal width')
plt.ylabel('Petal length')
plt.show()

# --- DETERMINING THE OPTIMAL K --- #
from em import calculate_log_likelihood
from kmeans import KMeans

n_clusters = 4
X = utils.load_data('EMGaussienne.data')
Xtest = utils.load_data('EMGaussienne.test')
max_iterations = 150
ridge = 1e-6
verbose = True

n_samples, n_features = X.shape

# Initialise the data using kmeans
k_means = KMeans(k=n_clusters)
k_means_labels, _ = k_means.fit(X.copy())
k_means_cluster_centers = k_means.centers_

mu, sigma, tpi = calculate_em(X, n_clusters)
print 'Log likelihood %d' % calculate_log_likelihood(X, mu, sigma, tpi)
print 'Log likelihood %d' % calculate_log_likelihood(Xtest, mu, sigma, tpi)

p = np.zeros((n_clusters, n_samples))
for i in range(n_clusters):
    p[i, :] = _calculate_normal(X, mu[i, :], sigma[i, :])
em_labels = p.argmax(axis=0)

p = np.zeros((n_clusters, n_samples))
for i in range(n_clusters):
    p[i, :] = _calculate_normal(Xtest, mu[i, :], sigma[i, :])
def fit(self, X):
    self.X = X
    self.N = X.shape[0]
    self.ndim = X.shape[1]
    np.random.seed(self.random_seed)
    matX = np.asmatrix(X)

    # initialization schemes
    if self.init_method == 'random':
        if self.init_means is not None:
            mu = self.init_means
        else:
            # sample from the data
            mu = X[np.random.choice(range(0, len(X)), self.num_gaussians), :]
        if self.init_cov is not None:
            sigma = self.init_cov
        else:
            sigma = list()
            for k in range(self.num_gaussians):
                sigma.append(np.identity(self.ndim, dtype=np.float64))
                sigma[k] += np.random.rand(self.ndim, self.ndim)  # purely synthetic
                sigma[k] = np.dot(sigma[k], sigma[k].T)  # making it positive semi-definite and symmetric
                sigma[k] /= sigma[k].sum()
                # lowerbound = k * self.N / self.num_gaussians  # sample from data
                # upperbound = lowerbound + 20
                # sigma[k] = np.cov(X[lowerbound:upperbound, :].T)
        if self.init_weights is not None:
            lmbda = self.init_weights
        else:
            lmbda = np.random.rand(self.num_gaussians)
            lmbda /= lmbda.sum()
    elif self.init_method == 'kmeans':
        # use means of kmeans as initial means, and calculate cov from the clusters
        model = KMeans(K=self.num_gaussians, max_iter=5)
        model.fit(X)
        labels = model.pred(X)
        mu = np.zeros((self.num_gaussians, self.ndim))
        sigma = [np.zeros((self.ndim, self.ndim))] * self.num_gaussians
        for k in range(self.num_gaussians):
            cluster = X[labels == k]
            mu[k] = cluster.mean(axis=0)
            sigma[k] = np.cov(cluster.T)
        if self.init_weights is not None:
            lmbda = self.init_weights
        else:
            lmbda = np.random.rand(self.num_gaussians)
            lmbda /= lmbda.sum()

    ######## BEGIN ACTUAL ALGORITHM ###################
    for iter in range(self.max_iter):
        phat = np.zeros((self.N, self.num_gaussians))
        N = np.zeros(self.num_gaussians)

        # E step
        for k in range(0, self.num_gaussians):
            normal_var = normal(mean=mu[k], cov=sigma[k])
            phat[:, k] = lmbda[k] * normal_var.pdf(X)
        phat /= phat.sum(axis=1)[:, None]
        # faster to do it all with numpy than use loops
        # for n in range(0, self.N):  # loop over each data point
        #     for k in range(0, self.num_gaussians):
        #         normalx = normal(mean=mu[k], cov=sigma[k]).pdf(X[n, :])
        #         phat[n, k] = lmbda[k] * normalx
        #     phat[n, :] /= phat[n, :].sum()

        # M step
        for k in range(self.num_gaussians):
            N[k] = phat[:, k].sum()
            mu[k] = np.dot(phat[:, k], X) / N[k]
            intermed = np.multiply(phat[:, k], (matX - mu[k]).T).T
            sigma[k] = np.dot(intermed.T, (matX - mu[k])) / N[k]
            lmbda[k] = N[k] / self.N
        # end of this iteration

    self.mu = mu
    self.sigma = sigma
    self.lmbda = lmbda
import cPickle
import matplotlib.pyplot as plt
import numpy as np
from kmeans import KMeans, biKMeans

if __name__ == "__main__":
    # load the data
    X, y = cPickle.load(open('data.pkl', 'rb'))

    # plot the result after 1, 2, 3, ... iterations in turn
    for max_iter in range(6):
        # set the parameters
        n_clusters = 10
        initCent = X[50:60]  # initialize the centroids to X[50:60]
        # train the model
        clf = KMeans(n_clusters, initCent, max_iter)
        clf.fit(X)
        cents = clf.centroids
        labels = clf.labels
        sse = clf.sse
        # plot the clustering result, one color per cluster
        colors = ['b', 'g', 'r', 'k', 'c', 'm', 'y', '#e24fff', '#524C90', '#845868']
        for i in range(n_clusters):
            index = np.nonzero(labels == i)[0]
            x0 = X[index, 0]
            x1 = X[index, 1]
            y_i = y[index]
            for j in range(len(x0)):
                plt.text(x0[j], x1[j], str(int(y_i[j])), color=colors[i],
                         fontdict={'weight': 'bold', 'size': 9})
            plt.scatter(cents[i, 0], cents[i, 1], marker='x', color=colors[i], linewidths=12)
        plt.title("SSE={:.2f}".format(sse))