def main():
    """Cluster the CSV named on the command line for k = 2..4, plotting and saving each run."""
    dataset_path = sys.argv[1]
    manager = CSVManager()
    frame = manager.read(dataset_path)
    frame = manager.replaceNan(frame)
    cleaned = manager.deleteObjectColumns(frame)
    matrix = manager.convertCSVToMatrix(cleaned)
    try:
        for k in range(2, 5):
            kmeans = KMeans(k)
            kmeans.fit(matrix)
            # Final centroids as large black circles.
            for centroid in kmeans.centroids:
                plt.scatter(kmeans.centroids[centroid][0],
                            kmeans.centroids[centroid][1],
                            marker="o", color="k", s=150, linewidths=5)
            # Members of each cluster in a random colour.
            for classification in kmeans.classifications:
                color = randomColor()
                for featureset in kmeans.classifications[classification]:
                    plt.scatter(featureset[0], featureset[1],
                                marker="x", color=color, s=60, linewidths=2)
            plt.show()
            confusionMatrix, purity = kmeans.purity()
            saveData(confusionMatrix, purity, dataset_path, k)
    except Exception:
        print("An empty cluster was found, please run the program again. This program does not handle empty clusters")
def __init__(self, cluster_number: int, data: np.ndarray):
    """Initialise mixture parameters from a fitted K-Means clustering of *data*.

    Means come from the K-Means centroids, priors from cluster occupancy,
    and covariances from the per-cluster scatter matrices.
    """
    k_means = KMeans(data=data, cluster_number=cluster_number)
    k_means.fit()
    k_means.visualize()
    self.cluster_number = cluster_number
    self.data = data
    # FIX: removed dead zero-initialisations of clusters_means/clusters_priors —
    # both were overwritten immediately after being allocated.
    self.clusters_means = k_means.cluster_data_means
    # Prior of each cluster = fraction of points K-Means assigned to it.
    self.clusters_priors = np.sum(k_means.cluster_assignment_matrix,
                                  axis=0) / data.shape[0]
    self.clusters_covariances = np.zeros(
        (cluster_number, data.shape[1], data.shape[1]))
    # NOTE(review): divides by the cluster size (biased ML estimator), not N-1,
    # as the original comment already flagged — confirm this is intended.
    for k in range(self.cluster_number):
        idx = np.nonzero(k_means.cluster_assignment_matrix[:, k])[0]
        cov_data = (data[idx] - self.clusters_means[k])
        self.clusters_covariances[k] = np.dot(cov_data.T, cov_data) / idx.shape[0]
    # Buffers filled later: data with its cluster label, and soft memberships.
    self.clustered_data = np.zeros((data.shape[0], data.shape[1] + 1))
    self.cluster_probability_matrix = np.zeros((self.data.shape[0], cluster_number))
def fit(self, data):
    """Seed a PSO search with plain k-means centroids, then optimise them."""
    seed_model = KMeans(n_clusters=self.n_clusters)
    seed_model.fit(data)
    # Flatten the k-means centroids into one candidate particle.
    seed = []
    for key in seed_model.centroids:
        seed.append(seed_model.centroids[key])
    seed = np.array(seed).ravel()

    self.dim = data.shape[1]
    self.pso = PSO(dim=self.dim * self.n_clusters, minf=0, maxf=1,
                   swarm_size=self.swarm_size, n_iter=self.n_iter,
                   w=self.w, lb_w=self.lb_w, c1=self.c1, c2=self.c2)
    self.pso.set_candidate(seed)
    self.pso.optimize(self.__objective_function, customizable=True,
                      dim=self.dim, n_clusters=self.n_clusters, data=data)

    # Reshape the best particle back into one centroid per row.
    best = self.pso.global_optimum.pos.reshape((self.n_clusters, self.dim))
    self.centroids = {}
    for idx in range(len(best)):
        self.centroids[idx] = best[idx]
def main():
    """CLI entry point: cluster a CSV dataset with k-means and plot the result."""
    parser = argparse.ArgumentParser()
    parser.add_argument('path', type=str, help="path to dataset")
    parser.add_argument('--k', type=int, default=3,
                        help="quantity of clusters (default 3)")
    parser.add_argument('--it', type=int, default=100,
                        help="max iterations (default 100)")
    parser.add_argument('--tol', type=float, default=0.001,
                        help="tolerance (default 0.001)")
    args = parser.parse_args()

    manager = CSVManager()
    frame = manager.replaceNan(manager.read(args.path))
    matrix = manager.convertCSVToMatrix(manager.deleteObjectColumns(frame))

    model = KMeans(args.k, args.it, args.tol)
    model.fit(matrix)
    # Centroids as black circles, members as coloured crosses.
    for centroid in model.centroids:
        plt.scatter(model.centroids[centroid][0], model.centroids[centroid][1],
                    marker="o", color="k", s=150, linewidths=5)
    for classification in model.classifications:
        color = randomColor()
        for featureset in model.classifications[classification]:
            plt.scatter(featureset[0], featureset[1],
                        marker="x", color=color, s=60, linewidths=2)
    plt.show()
def main():
    """Cluster the CSV named on the command line for k = 2..4 and write SSWC scores."""
    path = sys.argv[1]
    csvManager = CSVManager()
    df = csvManager.read(path)
    df = csvManager.replaceNan(df)
    # FIX: deleteObjectColumns was called twice in a row on the same frame;
    # the second call was redundant.
    formattedCSV = csvManager.deleteObjectColumns(df)
    matrix = csvManager.convertCSVToMatrix(formattedCSV)
    try:
        with open('result/result.txt', 'w') as file:
            res = ''
            for k in range(2, 5):
                kmeans = KMeans(k)
                kmeans.fit(matrix)
                simplifiedSilhouette = SimplifiedSilhouette(formattedCSV, kmeans)
                sswc = simplifiedSilhouette.calculate()
                res += 'K = ' + str(k) + '; ' + 'SSWC = ' + str(sswc) + '\n'
            # Write the accumulated report once, after all k values.
            file.write(res)
    except Exception:
        print("An empty cluster was found, please run the program again. This program does not handle empty clusters")
def treeClassification(data):
    """Fit a 6-way k-means on the frame's values and print a per-cluster report."""
    model = KMeans(n_clusters=6, max_iter=200)
    model.fit(data.values, True)
    cluster_report(data, model.prediction)
def attr_analysis(data):
    """Box-plot every attribute of each cluster after a 6-way k-means fit."""
    model = KMeans(n_clusters=6, max_iter=200)
    model.fit(data.values, True)
    for cluster in model.clusters:
        # One 3x6 grid of boxplots per cluster, one subplot per attribute.
        for idx in range(len(cluster.data[0])):
            values = _column(cluster.data, idx)
            axis = plt.subplot(3, 6, idx + 1)
            axis.set_title(data.columns[idx], {'fontsize': 6})
            plt.boxplot(values)
        plt.show()
def findClustering(self, cluster):
    """Split *cluster* into 2 sub-clusters with k-means and score the split.

    Returns:
        (sse, predictedClass): the summed squared distances and the per-element
        cluster assignment produced by the 2-way k-means.
    """
    # Use KMeans to form 2 clusters
    kmeans = KMeans()
    kmeans.setK(2)
    kmeans.fit(cluster)
    predictedClass = kmeans.predict(cluster)
    centroids = kmeans.centroids
    # FIX: renamed the accumulator from `sum`, which shadowed the builtin.
    # NOTE(review): this sums each element's squared distance to EVERY centroid,
    # not only its assigned one — confirm that is the intended score before
    # changing the formula, since callers compare these values.
    sse = 0
    for clusterIndex in range(len(centroids)):
        for element in cluster:
            sse += (np.linalg.norm(
                np.array(element) - np.array(centroids[clusterIndex])))**2
    return sse, predictedClass
def sse_plot(X, start=2, stop=20):
    """Plot the k-means inertia (SSE) for every cluster count in [start, stop)."""
    inertia = []
    for count in range(start, stop):
        print("====ITERATION:", count)
        model = KMeans(n_clusters=count, max_iter=1000)
        model.fit(X, True)
        inertia.append(model.sum_squared_error())

    plt.figure(figsize=(12, 6))
    plt.plot(range(start, stop), inertia, marker='o')
    plt.xlabel('Number of Clusters')
    plt.ylabel('SSE')
    plt.title('Inertia plot with K')
    plt.xticks(list(range(start, stop)))
    plt.show()
def initialize_(self, X):
    """Initialise the (n, k) responsibility matrix ``cond_prob_``.

    With 'kmeans' initialisation every point gets probability 1 for its
    k-means cluster (one-hot rows); otherwise rows are random and
    normalised to sum to 1.
    """
    n, p = X.shape
    # kmeans initialization
    if self.initialization_ == 'kmeans':
        kmeans_clstr = KMeans(nr_clusters=self.k_, n_init=1)
        kmeans_clstr.fit(X)
        labels = kmeans_clstr.labels_
        self.cond_prob_ = np.zeros((n, self.k_))
        # IMPROVED: vectorised one-hot encoding of the hard assignment
        # (replaces the per-row Python loop; identical result).
        self.cond_prob_[np.arange(n), np.asarray(labels, dtype=int)] = 1
    # else randomly initialize them
    else:
        foo = np.random.rand(n, self.k_)
        self.cond_prob_ = foo / np.sum(foo, axis=1)[:, np.newaxis]
def main():
    """Cluster data2.txt with k=5 and visualise members, centroids and seeds."""
    X = handle_data('data2.txt')
    km = KMeans(5)
    km.fit(X)

    # Repeat the palette so any cluster index maps to a colour.
    palette = 10 * [
        'gold', 'mediumseagreen', 'orangered', 'lightpink', 'coral',
        'mediumslateblue', 'violet', 'magenta'
    ]
    plt.figure(figsize=(10, 10))
    # Members of each cluster, one colour per cluster.
    for classification in km.classes:
        color = palette[classification]
        for features in km.classes[classification]:
            plt.scatter(features[0], features[1], color=color, s=10)
    # Final centroid positions as black crosses.
    for centroid in km.centroids:
        plt.scatter(km.centroids[centroid][0], km.centroids[centroid][1],
                    c='k', s=100, marker="x")
    # The random initial seed points as black stars.
    for l in range(km.k):
        plt.scatter(km.randoms[l][0], km.randoms[l][1],
                    marker='*', c='k', s=100)

    plt.legend(['* = Initial random points', 'X = Final cluster centers'])
    plt.xlabel('x1')
    plt.ylabel('x2')
    plt.title('k-Means')
    plt.show()
    print('\t\t\tIteration:', km.iterations)
    print('\n\t\t\tk value: ', km.k)
def main():
    """Fit a 4-cluster k-means on synthetic blobs and plot the assignment."""
    random_seed = 0
    iteration = 50
    init_method = 'kmeans++'
    X, y_true = make_blobs(n_samples=300, centers=4, cluster_std=0.60,
                           random_state=random_seed)
    plt.scatter(X[:, 0], X[:, 1], s=4, c='blue')

    kmeans = KMeans()
    kmeans.fit(X, 4, random_seed=random_seed, iteration=iteration,
               init_method=init_method)
    y_pred = kmeans.predict(X)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(X[:, 0], X[:, 1], c=y_pred, s=4, cmap='viridis')
    # Overlay the learned centroids in red.
    centers = kmeans.centroids
    ax.scatter(centers[:, 0], centers[:, 1], c='red', s=15, alpha=0.5)
    plt.show()
def visualization_2d(data):
    """Project *data* onto 2 principal components and scatter the 6 k-means clusters."""
    # Reduce dimensions based on data variance (PCA).
    pca = PCA(n_components=2)
    pca_data = pca.fit_transform(data)

    km = KMeans(n_clusters=6, max_iter=200)
    km.fit(pca_data, True)

    palette = ['red', 'green', 'blue', 'purple', 'orange', 'yellow', 'gray']
    for idx in range(len(km.clusters)):
        xs = [row[0] for row in km.clusters[idx].data]
        ys = [row[1] for row in km.clusters[idx].data]
        plt.scatter(xs, ys, c=palette[idx], label='cluster ' + str(idx))
    plt.show()
def main():
    """Cluster the iris measurements with k=3 and print species vs. prediction."""
    km = KMeans(3)
    iris = pd.read_csv("iris.csv")
    data = np.array(
        iris[["Sepal.Length", "Sepal.Width", "Petal.Length",
              "Petal.Width"]].values.tolist())
    km.fit(data)
    print("cluster centers: %s" % km.cluster_centers)
    for d in iris.values:
        # BUG FIX: the feature vector used d[2] twice. Since the species label
        # is read from d[5], the row layout is
        # [index, Sepal.Length, Sepal.Width, Petal.Length, Petal.Width, Species],
        # so the four features are d[1]..d[4].
        prediction = km.predict([[d[1], d[2], d[3], d[4]]])
        print(d[5] + " - " + str(prediction[0]))
def main():
    """PCA-project the wine data to 2-D, k-means it into 3 classes, and report."""
    dim = 2
    num_class = 3
    dataset_dir = '../input/wine.csv'
    train_x, train_y, raw_data = data_loader(dataset_dir)

    # Dimensionality reduction.
    pca = PCA(first_k=dim, use_threshold=False, threshold=0.5)
    proj = pca.fit(train_x)

    # Clustering on the projected data.
    kmeans = KMeans(K=num_class)
    center, predict_y = kmeans.fit(proj)

    # Evaluation and reporting.
    result = evaluate(proj, train_y, predict_y, k=num_class)
    visualization(center, proj, predict_y, dim)
    save_to_csv(raw_data, predict_y)
    print(result)
def exploratory_analysis(data):
    """Pair-plot the strongest attributes coloured by a 6-way k-means clustering."""
    best_columns = [
        "BALANCE", "PURCHASES", "CASH_ADVANCE", "CREDIT_LIMIT", "PAYMENTS",
        "MINIMUM_PAYMENTS", "PRC_FULL_PAYMENT"
    ]
    # Restrict to the selected columns before clustering.
    best_data = pd.DataFrame(data[best_columns])
    km = KMeans(n_clusters=6, max_iter=200)
    km.fit(best_data.values, True)

    # Attach the cluster label so seaborn can colour by it.
    best_data['cluster'] = km.prediction
    best_columns.append('cluster')
    sb.pairplot(best_data[best_columns], hue='cluster',
                x_vars=best_columns, y_vars=best_columns, height=5, aspect=1)
    sb.pairplot(best_data[best_columns], hue='cluster',
                x_vars=best_columns[0:4], y_vars='cluster', height=5, aspect=1)
    sb.pairplot(best_data[best_columns], hue='cluster',
                x_vars=best_columns[4:7], y_vars='cluster', height=5, aspect=1)
    plt.show()
def fit_predict(self, X):
    """Fit a spectral-clustering model on the dataset and return cluster labels.

    Parameters
    ----------
    X : ndarray of shape (N, M)
        N samples, each with M attributes.
    """
    # Build the affinity graph.
    if self.affinity == "full_link":
        weights = self.full_link(X, dist=self.rbf)
    elif self.affinity == "nearest_neighbors":
        weights = self.knn_nearest(X)

    norm_laplacians = self.laplacians_matrix(weights)
    eigval, eigvec = np.linalg.eig(norm_laplacians)
    # Keep eigenvectors of the n_clusters smallest eigenvalues as the embedding.
    smallest = np.argsort(eigval)[0:self.n_clusters]
    embedding = eigvec[:, smallest]

    model = KMeans(n_clusters=self.n_clusters)
    model.fit(embedding)
    return model.predict(embedding)
def main():
    """Generate Gaussian blobs, cluster them with k-means, and save comparison plots."""
    logging.basicConfig(filename="result/log.txt",
                        filemode='w',
                        format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
                        datefmt='%H:%M:%S',
                        level=logging.DEBUG)
    logging.getLogger().setLevel(logging.INFO)

    parser = argparse.ArgumentParser()
    parser.add_argument('-n_clusters', type=int, default=5)
    parser.add_argument('-n_points', type=int, default=100)
    opt = parser.parse_args()

    tester = Tester(n_gaussian_clusters=opt.n_clusters)
    # Generate data from n 2d multivariate gaussian parameters
    data, labels = tester.generate_2d_gaussian_points(
        how_many_per_each_gaussian=opt.n_points)
    logging.info(" Generated {} data points from {} different 2 dimensional "
                 "multivariate gaussian distributions. ({} data points for "
                 "each cluster.)".format(opt.n_clusters * opt.n_points,
                                         opt.n_clusters, opt.n_points))

    # Raw data, then ground-truth colouring.
    utils.draw(data, labels, without_label_color=True, means=None,
               title="Data", save="result/raw.png", show=False)
    utils.draw(data, labels, without_label_color=False, means=tester.means,
               title="Gaussian", save="result/gaussian.png", show=False)

    # KMeans prediction.  (FIX: renamed the misspelled local `prediction_lables`.)
    kmeans = KMeans(n_cluster=opt.n_clusters)
    prediction_labels, prediction_centers = kmeans.fit(data)
    utils.draw(data, prediction_labels, without_label_color=False,
               means=prediction_centers, title="KMeans",
               save="result/kmeans.png", show=False)

    # Concatenate results into one image.
    png_list = ["result/raw.png", "result/gaussian.png", "result/kmeans.png"]
    utils.concatenate_pngs(png_list, "result/final.png")
# NOTE(review): fragment of a larger script. The opening of the first
# triple-quoted block appears to lie above this chunk, so the statements before
# the first """ (BoVW/concatenate/savetxt) are most likely commented-out text,
# the imread..GMM-prep section is the active code, and the sklearn section is a
# second commented-out block. Left byte-identical — confirm which sections are
# meant to run before editing.
image_row = kmeans.BoVW(means, img_vec).reshape((1, 32)) image = np.concatenate((image, image_row), axis = 0) print(f1) np.savetxt('image_data.txt', image) image = np.loadtxt('image_data.txt') """ img = cv2.imread('31.png') print(img) img_obj = ImageHandler(img) x = img_obj.ToShiftedPatches() x = np.array(x) np.savetxt('cell_1.txt', x) kmeans_obj = KMeans(3, x) kmeans_obj.fit(3, 0.002) means = kmeans_obj.mean_vec cov_mat_list = kmeans_obj.CovMatrix() mixture_coeff = kmeans_obj.MixtureCoeff() print(cov_mat_list) """from sklearn.cluster import KMeans obj = KMeans(n_clusters = 3, init = 'k-means++', max_iter = 100, n_init = 10, random_state = 0) y_Kmeans = obj.fit_predict(x) print(obj.cluster_centers_[:])""" GMM_obj = GMM(3, x, means, cov_mat_list, mixture_coeff) GMM_obj.fit(0.0002)
import numpy as np
from scipy import io

############# FILE STUFF #############
trainFileMNIST = "./mnist_data/images.mat"
trainMatrix = io.loadmat(trainFileMNIST)  # Dictionary

############# GET DATA #############
# FIX: converted Python 2 `print` statements (a syntax error under Python 3)
# to the Python 3 function form; output is unchanged.
print(20 * "#", "Getting Data", 20 * "#")
imageData = np.array(trainMatrix['images'])
imageData = np.rollaxis(imageData, 2, 0)  # move the index axis to be the first
dataShape = np.shape(imageData)
print("Image Data Shape", dataShape)

# Flatten each image into a 1-D feature vector.
imageDataFlat = []
for elem in imageData:
    imageDataFlat.append(elem.flatten())
dataShape = np.shape(imageDataFlat)
print("Image Data Flat Shape", dataShape)

# Cluster at several k values and visualise the centroids.
num_clusters = [5, 10, 20]
for cluster in num_clusters:
    print(20 * "#", "Num Clusters:", cluster, 20 * "#")
    KM = KMeans(cluster, max_iter=10)
    KM.fit(imageDataFlat)
    visualize(KM.cluster_centers_, cluster)
display_clusters(y, "Настоящие метки")


def display_metrics(n_clusters, metrics, title):
    """Plot a clustering metric against the number of clusters."""
    plt.figure(figsize=(8, 6))
    plt.grid(linestyle='--')
    plt.plot(n_clusters, metrics, linestyle='-', marker='.', color='r')
    plt.title(title)
    plt.xlabel("Количество кластеров")
    plt.ylabel("Значение метрики")
    plt.show()


external_metrics = []
internal_metrics = []
for i in range(1, 11):
    kMean = KMeans(k=i)
    centroids = kMean.fit(X_norm)
    y_pred = kMean.predict(X_norm)
    # The silhouette score is undefined for a single cluster.
    if i == 1:
        internal_metrics.append(0.0)
    else:
        internal_metrics.append(silhouette(X_norm, y_pred, centroids))
    external_metrics.append(adjusted_rand_index(y, y_pred))
    display_clusters(y_pred, str(i) + ' кластеров')

display_metrics(range(1, 11), external_metrics, 'Внешняя метрика')
display_metrics(range(1, 11), internal_metrics, 'Внутренняя метрика')
from KMeans import KMeans
from sklearn.cluster import KMeans as km
import numpy as np
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
import time

# Synthetic 3-blob dataset with fairly wide clusters.
reduced_data, check = make_blobs(n_samples=1000, n_features=2, centers=3,
                                 cluster_std=7)

# Time the hand-rolled KMeans implementation.
start_time = time.time()
kmeans = KMeans(n_cluster=3, total_iter=300)
kmeans.fit(reduced_data)
pred = kmeans.predict(reduced_data)
print(time.time() - start_time)

plt.figure(1)
for i in range(0, len(pred)):
    if pred[i] == 0:
        plt.scatter(reduced_data[i, 0], reduced_data[i, 1], c="b", alpha=.5)
    if pred[i] == 1:
        plt.scatter(reduced_data[i, 0], reduced_data[i, 1], c="g", alpha=.5)
    if pred[i] == 2:
        plt.scatter(reduced_data[i, 0], reduced_data[i, 1], c="y", alpha=.5)
# Centroids of the custom model ("wrong kmeans, good plots" per the author).
plt.scatter(kmeans.centroids[0, 0], kmeans.centroids[0, 1], marker="*", c="r")
plt.scatter(kmeans.centroids[1, 0], kmeans.centroids[1, 1], marker="*", c="r")
plt.scatter(kmeans.centroids[2, 0], kmeans.centroids[2, 1], marker="*", c="r")

# Time scikit-learn's KMeans for comparison.
start_time = time.time()
kmeans = km(n_clusters=3)
kmeans.fit(reduced_data)
pred = kmeans.predict(reduced_data)
# Set this to what attribute list you want to use if preset == "Regular": attributeList = bigAttributes elif preset == "Wings": attributeList = wingAttributes X = dataBase.makePlayersList(attributeList) print("Working with", len(X), "players with", len(attributeList), "attributes each.") kX = X.copy() bkX = X.copy() # Clustering with KMeans kmeans = KMeans() kmeans.setK(clusters) kmeans.fit(kX) pred = kmeans.predict(kX) # Make plot for KMeans # Convert data points to 2D points for plotting pca = PCA(n_components=2) kX = pca.fit_transform(kX) # Make labels based on player position """ labels for regular: 0 - goalkeepers 1 - defenders 2 - midfielders 3 - forwards
from scipy import io
import numpy as np  # FIX: `np` is used below but was not imported in this chunk

############# FILE STUFF #############
trainFileMNIST = "./mnist_data/images.mat"
trainMatrix = io.loadmat(trainFileMNIST)  # Dictionary

############# GET DATA #############
# FIX: converted Python 2 `print` statements (a syntax error under Python 3)
# to the Python 3 function form; output is unchanged.
print(20 * "#", "Getting Data", 20 * "#")
imageData = np.array(trainMatrix['images'])
imageData = np.rollaxis(imageData, 2, 0)  # move the index axis to be the first
dataShape = np.shape(imageData)
print("Image Data Shape", dataShape)

# Flatten each image into a 1-D feature vector.
imageDataFlat = []
for elem in imageData:
    imageDataFlat.append(elem.flatten())
dataShape = np.shape(imageDataFlat)
print("Image Data Flat Shape", dataShape)

# Cluster at several k values and visualise the centroids.
num_clusters = [5, 10, 20]
for cluster in num_clusters:
    print(20 * "#", "Num Clusters:", cluster, 20 * "#")
    KM = KMeans(cluster, max_iter=10)
    KM.fit(imageDataFlat)
    visualize(KM.cluster_centers_, cluster)
def k_means(self, data, k=3):
    """Run the k-means algorithm on *data* and return the named cluster assignments."""
    model = KMeans(k)
    model.fit(data)
    return model.classification_names
from KMeans import KMeans
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
import numpy as np

# Three Gaussian blobs inside a [-15, 15] box.
X, y = make_blobs(n_samples=1000, n_features=2, centers=3,
                  center_box=(-15, 15))

model = KMeans(n_clusters=3)
model.fit(X)
prediction = model.predict(X)
loss = model.loss

# Training loss curve.
plt.figure(1)
plt.plot(range(len(loss)), loss)

# Cluster assignment of the training points.
plt.figure(2)
plt.scatter(X[:, 0], X[:, 1], c=prediction)

# Decision regions, sampled at uniform random points.
plt.figure(3)
test = np.random.uniform(-15, 15, size=(5000, 2))
test_prediction = model.predict(test)
plt.scatter(test[:, 0], test[:, 1], c=test_prediction)
plt.show()
# Names of the input txt files.
file1 = 'data1.txt'
file2 = 'data2.txt'
file3 = 'data3.txt'

# Load the datasets.
dataset1 = pd.read_csv(file1, sep=',', header=None)
dataset2 = pd.read_csv(file2, sep=',', header=None)
dataset3 = pd.read_csv(file3, sep=',', header=None)

# ===================== K-Means for Dataset1: k=3, k=7 =====================
kmeans1 = KMeans(n_cluster=3, random_state=721)
kmeans1.fit(dataset1)
kmeans1.save_figures(outpaths.outpath1)
kmeans1.create_gif(outpaths.outpath1)

kmeans2 = KMeans(n_cluster=7, random_state=721)
kmeans2.fit(dataset1)
kmeans2.save_figures(outpaths.outpath2)
kmeans2.create_gif(outpaths.outpath2)

# ===================== K-Means for Dataset2: k=2, k=5 =====================
kmeans3 = KMeans(n_cluster=2, random_state=721)
kmeans3.fit(dataset2)
kmeans3.save_figures(outpaths.outpath3)
kmeans3.create_gif(outpaths.outpath3)

kmeans4 = KMeans(n_cluster=5, random_state=721)
label_features = LabelFeatures(labelled_dataset_path=labelled_dataset_path,
                               unlabelled_dataset_path=unlabelled_dataset_path,
                               feature_name='SIFT',
                               decomposition_name='')
label_features.set_features()
dorsal_features = label_features.get_label_features('dorsal')
palmar_features = label_features.get_label_features('palmar')
unlabelled_features = label_features.get_unlabelled_images_decomposed_features()

print('Computing clusters associated with dorsal-hand images...')
# Shuffle the (image, feature) pairs reproducibly before clustering.
temp_dictionary = list(dorsal_features.items())
np.random.seed(23)
np.random.shuffle(temp_dictionary)
dorsal_features = dict(temp_dictionary)
kmeans.fit(dorsal_features)

# Visualize the dorsal image clusters.
dorsal_image_cluster_map = kmeans.get_image_cluster_map()
dorsal_cluster_visualization = VisualizeClusters(dorsal_features,
                                                dorsal_image_cluster_map,
                                                'dorsal')
dorsal_cluster_visualization.plot()
similarity_val1 = kmeans.get_similarity_val(
    labelled_dataset_features=dorsal_features,
    unlabelled_dataset_features=unlabelled_features)

print('Computing clusters associated with palmar-hand images...')
temp_dictionary = list(palmar_features.items())
np.random.shuffle(temp_dictionary)
import pandas as pd
from KMeans import KMeans
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import loadmat

# Load the example dataset.
mat = loadmat('DataSets/KMeans_PCA/ex7data2.mat')
data = mat['X']

model = KMeans()
model.fit(data)

# One colour per cluster for the member points.
palette = ["r", "g", "c"]
for classification in model.classes:
    color = palette[classification]
    for sample in model.classes[classification]:
        plt.scatter(sample[0], sample[1], color=color, s=30)
# Final centroids as black crosses.
for centroid in model.centroids:
    plt.scatter(centroid[0], centroid[1], s=130, marker="x", color='black')
plt.show()
sk_agglo_accuracy_complete = 0
sk_agglo_accuracy_average = 0
dbscan_accuracy = 0
sk_dbscan_accuracy = 0

print('=== ACCURACY FROM PREDICT ===')
print()

k = 0
for train_index, test_index in kf.split(X, y):
    print(str(k) + '-fold')
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # Custom KMeans implementation.
    kmeans.fit(np.asarray(X_train))
    result = kmeans.predict(np.asarray(X_test))
    # NOTE(review): the local name `dict` shadows the builtin; kept as-is
    # because the continuation of this loop (outside this chunk) may read it.
    accuracy, dict = clustering_accuracy_score(np.asarray(y_test),
                                               np.asarray(result))
    kmeans_accuracy += accuracy
    print('KMeans')
    print('Accuracy\t', accuracy)
    print('Format {Real class : cluster}')
    print('Dict\t\t', str(dict))
    print()

    # Scikit-learn KMeans for comparison.
    sk_kmeans.fit(X_train)
    sk_result = sk_kmeans.predict(X_test)
    accuracy, dict = clustering_accuracy_score(np.asarray(y_test),
                                               np.asarray(sk_result))
    sk_kmeans_accuracy += accuracy
    print('Sklearn KMeans')
    print('Accuracy\t', accuracy)
# NOTE(review): the first two statements are the tail of a data-loading function
# whose `def` line lies above this chunk; indentation reconstructed to match.
    data /= std
    return data.values


if __name__ == "__main__":
    #######################################
    # KMEANS
    #######################################
    X = load_kmeans_data()
    # instantiate KMeans class
    k_means = KMeans(K_clusters=4, threshold=0.001, n_iters=1000,
                     initialization="forgy")
    # kmeans training
    k_means.fit(X)
    k_means.plot_training_history(X)

    #######################################
    # Simple Linear Regression
    #######################################
    # 1 - load the dataset
    data = pd.read_table(os.path.join(DATA_PATH, "data.txt"), sep="\t",
                         header=None)
    x = np.array(data[0])
    y = np.array(data[1])
    # 2 - regression by gradient descent
    lin_reg_grad = LinReg(method="gradient_descent")
    # train linear regression model with gradient descent
    lin_reg_grad.fit(x, y)
    print("linear_regression", lin_reg_grad.coefs)
class GMM():
    """Gaussian mixture model fitted by EM, initialised from a K-Means run.

    Supports 'full', 'tied', 'diag' and 'spherical' covariance structures and
    'support' (cluster-occupancy) or 'uniform' mixture-weight initialisation.
    NOTE(review): the 'tied' covariance initialisation is not implemented
    (see get_cov_from_init) — confirm 'tied' is actually usable end to end.
    """

    def __init__(self, initializer='support', cov_type='full'):
        assert initializer in [
            'support', 'uniform'
        ], 'Please select initialization scheme as support or uniform'
        assert cov_type in [
            'full', 'tied', 'diag', 'spherical'
        ], 'Please select covariance type as full, tied, diag, or spherical'
        self.kmeans_cls_ = KMeans()  # used by initialize() for the first guess
        self.means_ = None  # (k, d) component means
        self.cov_ = None  # component covariances; layout depends on cov_type_
        self.mixture_weights_ = None  # (k,) mixing proportions
        self.membership_weights_ = None  # (n, k) responsibilities
        self.k_ = None  # number of components, set in fit()
        self.ll_graph_ = []  # log-likelihood trace across EM iterations
        self.initializer_ = initializer
        self.cov_type_ = cov_type

    def fit(self, X, k, tol_=1e-6):
        # Standard EM loop: iterate while the log-likelihood still improves
        # by more than tol_.
        self.k_ = k
        self.initialize(X)
        new_ll = self.get_log_likelihood(X)
        old_ll = new_ll - tol_ * 10  # seed so the first loop test passes
        while old_ll - new_ll < -tol_:
            self.ll_graph_.append(new_ll)
            # E-step: refresh responsibilities.
            self.gaussian_probabilities_multiple(X, normalized=True)
            # M-step: refresh weights, means, covariances.
            self.update_mixture_weights()
            self.update_means(X)
            self.update_var(X)
            old_ll = new_ll
            new_ll = self.get_log_likelihood(X)

    def get_cov_from_init(self, X, predictions):
        # Per-component covariance estimates from the hard K-Means assignment.
        if self.cov_type_ == 'full':
            return np.array(
                [np.cov(X[predictions == k].T) for k in range(self.k_)])
        if self.cov_type_ == 'tied':
            pass  # NOTE(review): unimplemented — falls through and returns None
        if self.cov_type_ == 'diag':
            return np.array([
                (1 / (len(X[predictions == k]) - 1)) *
                np.einsum('ij->j', (X[predictions == k] - self.means_[k])**2)
                for k in range(self.k_)
            ])
        if self.cov_type_ == 'spherical':
            # Mean of the per-dimension variances, repeated across dimensions.
            return np.repeat(np.mean(np.array([
                (1 / (len(X[predictions == k]) - 1)) *
                np.einsum('ij->j', (X[predictions == k] - self.means_[k])**2)
                for k in range(self.k_)
            ]), axis=-1)[:, np.newaxis], X.shape[-1], axis=1)

    def initialize(self, X):
        # Fit K-Means, then derive initial means, covariances, and weights.
        self.kmeans_cls_.fit(X, self.k_)
        predictions_ = self.kmeans_cls_.predict(X)
        self.means_ = self.kmeans_cls_.means_
        self.cov_ = self.get_cov_from_init(X, predictions_)
        if self.initializer_ == 'support':
            # Weight each component by the fraction of points K-Means gave it.
            self.mixture_weights_ = np.array([
                sum(predictions_ == k) / len(predictions_)
                for k in range(self.k_)
            ])
        if self.initializer_ == 'uniform':
            self.mixture_weights_ = (np.array([1] * self.k_)) / self.k_

    def gaussian_probabilities_multiple(self, X, normalized=True):
        # Evaluate every component's Gaussian density at every point.
        # With normalized=True this is the E-step: stores the (n, k)
        # responsibility matrix in membership_weights_ and returns None.
        # With normalized=False it returns the raw (n, k) density matrix.
        d = X.shape[-1]
        input_ = X[:, None, :]
        if self.cov_type_ in ['full', 'tied']:
            # Mahalanobis quadratic form via einsum over all (point, component).
            exp_part = -0.5 * np.einsum(
                'ijk,jkl,ijl->ij', input_ - self.means_,
                np.array(list(map(np.linalg.inv, self.cov_))),
                input_ - self.means_)
            output = (1 / ((2 * np.pi)**(d / 2) * np.array(
                list(map(lambda x: np.linalg.det(x)**
                         (1 / 2), self.cov_)))))[None, :] * np.exp(exp_part)
        elif self.cov_type_ in ['diag', 'spherical']:
            # Diagonal covariance: product of 1-D Gaussians.
            exp_part = -0.5 * np.einsum('ijk,jk->ij',
                                        (input_ - self.means_[None, :, :])**2,
                                        1 / self.cov_)
            output = (1 / ((2 * np.pi)**(d / 2) * np.prod(self.cov_, axis=1)**
                           (1 / 2)))[None, :] * np.exp(exp_part)
        if normalized:
            # Bayes rule: weight by mixture priors and normalise per point.
            output = np.einsum('ij,j->ij', output, self.mixture_weights_)
            output = output / np.sum(output, axis=1, keepdims=True)
            self.membership_weights_ = output
        else:
            return output

    def update_mixture_weights(self):
        # M-step: new weight = mean responsibility per component.
        self.mixture_weights_ = np.einsum(
            'ij->j', self.membership_weights_) / self.membership_weights_.shape[0]

    def update_means(self, X):
        # M-step: responsibility-weighted means.
        self.means_ = np.einsum('id,ik->kd', X,
                                self.membership_weights_) / np.einsum(
                                    'ik->k', self.membership_weights_)[:, None]

    def update_var(self, X):
        # M-step: responsibility-weighted covariance update.
        input_ = X[:, None, :]
        if self.cov_type_ in ['full', 'tied']:
            self.cov_ = np.einsum(
                'ij,ijk,ijl->jlk', self.membership_weights_,
                input_ - self.means_, input_ - self.means_) / np.einsum(
                    'ik->k', self.membership_weights_)[:, None, None]
        elif self.cov_type_ in ['diag', 'spherical']:
            # Diagonal update expanded as E[x^2] - 2*E[x]*mu + mu^2.
            self.cov_ = (np.einsum('ij,ilk,ilk->jk', self.membership_weights_,
                                   input_, input_) -
                         2 * np.einsum('ij,ilk,jk->jk',
                                       self.membership_weights_, input_,
                                       self.means_)) / np.einsum(
                                           'ik->k', self.membership_weights_
                                       )[:, None] + self.means_**2

    def get_log_likelihood(self, X):
        # Total log-likelihood of X under the current mixture parameters.
        output = self.gaussian_probabilities_multiple(X, normalized=False)
        output = np.log(np.einsum('ij,j->i', output, self.mixture_weights_))
        return np.einsum('i->', output)
# NOTE(review): chunk of a larger script — the opening of this list literal
# (and the assignment to `data_t`) lies above this view, and the chunk ends
# mid-section; left byte-identical apart from formatting.
    [1.9, 3], [1, 2.7], [1.9, 3], [1, 2.7], [1.9, 2.4], [0.8, 2],
    [1.6, 1.8], [1, 1]
]
print("\n** Exercise 1 Dataset**")
km = KMeans(3, data_t)
km.fit()
print("Centroids: \n")
km.print_centroids()
# Centroids in fixed colours, raw points uncoloured.
plt.scatter(x=[v[0] for v in km.centroids],
            y=[v[1] for v in km.centroids],
            c=['red', 'blue', 'green'])
plt.scatter(x=[d[0] for d in data_t], y=[d[1] for d in data_t])
plt.title("K-Means Exercise 1")
#######
print("\n ** Iris Dataset **")
data_iris = datasets.load_iris(return_X_y=True)[0]
print("\n K=3")