def kkmeans():
    """Cluster the built-in sample data into 3 groups and pretty-print summaries.

    The clustering is repeated from a fresh ``Kmeans`` instance until each of
    the three clusters holds at least four rows, after which
    ``kmeans_helper`` is applied to every cluster and to their combination
    and the results are printed with ``pprint``.
    """
    # 20 children, 10 days of data per child.
    data = [
        [1, 8, 20, 19, 4, 8, 3, 0, 0, 1],
        [18, 18, 1, 19, 4, 13, 6, 19, 4, 1],
        [4, 7, 10, 3, 6, 4, 10, 20, 11, 10],
        [12, 5, 4, 14, 7, 10, 16, 2, 9, 17],
        [7, 19, 14, 17, 11, 15, 19, 6, 8, 6],
        [17, 7, 3, 5, 7, 20, 1, 16, 13, 3],
        [19, 11, 10, 0, 17, 2, 14, 15, 5, 6],
        [4, 14, 18, 9, 19, 19, 1, 18, 20, 7],
        [20, 15, 8, 3, 12, 1, 12, 6, 0, 10],
        [18, 16, 17, 6, 0, 9, 9, 11, 2, 8],
        [2, 2, 9, 3, 19, 18, 1, 16, 9, 20],
        [15, 15, 13, 19, 11, 7, 20, 8, 14, 6],
        [1, 20, 1, 17, 4, 3, 13, 4, 2, 18],
        [0, 16, 18, 20, 16, 14, 8, 20, 5, 14],
        [11, 1, 7, 17, 17, 11, 10, 14, 6, 16],
        [8, 12, 15, 8, 5, 18, 19, 1, 13, 4],
        [17, 20, 13, 9, 11, 0, 16, 8, 16, 15],
        [3, 2, 12, 8, 8, 5, 7, 8, 20, 3],
        [20, 2, 2, 13, 4, 20, 0, 4, 14, 11],
        [20, 3, 12, 9, 14, 18, 17, 7, 5, 7],
    ]
    min_rows = 4

    def cluster_once():
        """Run one complete clustering pass and return its cluster list."""
        model = Kmeans(data, 3)
        model.get_k_rand()
        model.compare_to_k()
        model.get_k_avarage()
        return model.compare_to_k2()

    cl = cluster_once()
    # Retry until no cluster is too small. The previous ``while True`` had no
    # exit condition, so the code below it could never run.
    while any(len(cl[i]["data"]["rows"]) < min_rows for i in range(3)):
        cl = cluster_once()

    data1 = kmeans_helper(cl[0])
    data2 = kmeans_helper(cl[1])
    data3 = kmeans_helper(cl[2])
    # NOTE(review): relies on the cluster objects supporting ``+``; their type
    # is not visible from here - confirm.
    data_all = kmeans_helper(cl[0] + cl[1] + cl[2])

    pprint.pprint(data1)
    pprint.pprint(data2)
    pprint.pprint(data3)
    pprint.pprint(data_all)
def initialization(self):
    """Seed the W, H and G matrices from a k-means clustering of ``self.data``.

    G receives 1.0 at (sample, assigned cluster), is offset by 0.01 and each
    row is divided by the size of that sample's cluster. H receives 1.0 at
    (assigned cluster, sample) and is offset by 0.2. W is ``data . G``.
    """
    n_bases = self._num_bases
    n_samples = self._num_samples

    # Start with all-zero matrices.
    self.W = np.zeros((self._data_dimension, n_bases))
    self.H = np.zeros((n_bases, n_samples))
    self.G = np.zeros((n_samples, n_bases))

    # Cluster the data; the per-sample assignments drive the initial values.
    clusterer = Kmeans(self.data[:, :],
                       num_bases=n_bases,
                       show_progress=self._show_progress)
    clusterer.initialization()
    clusterer.factorize()
    assign = clusterer.assigned

    # Number of samples assigned to each cluster.
    cluster_sizes = np.zeros(n_bases)
    for cluster in range(n_bases):
        cluster_sizes[cluster] = np.where(assign == cluster)[0].size

    sample_idx = range(len(assign))

    self.G[sample_idx, assign] = 1.0
    self.G += 0.01
    # Broadcasting one column of sizes divides every entry of a row by the
    # size of that sample's cluster.
    self.G /= np.reshape(cluster_sizes[assign], (-1, 1))

    self.H[assign, sample_idx] = 1.0
    self.H += 0.2

    self.W = np.dot(self.data[:, :], self.G)
def init_h(self):
    """Create whichever of ``H``, ``G`` and ``W`` the instance does not have yet.

    ``H`` and ``G`` are derived from a 10-iteration k-means clustering of
    ``self.data``; ``W`` is ``data . G``. Attributes that already exist are
    left untouched.

    The clustering is run when either ``H`` or ``G`` is missing. Previously it
    only ran inside the ``H`` branch, so an instance with a preset ``H`` but
    no ``G`` failed with an unbound-local error on ``assign``.
    """
    need_h = not hasattr(self, 'H')
    need_g = not hasattr(self, 'G')

    if need_h or need_g:
        # initialize using k-means
        km = Kmeans(self.data[:, :],
                    num_bases=self._num_bases,
                    seed=self.seed)
        km.factorize(niter=10)
        assign = km.assigned
        sample_idx = range(len(assign))

    if need_h:
        self.H = np.zeros((self._num_bases, self._num_samples))
        self.H.T[sample_idx, assign] = 1.0
        self.H += 0.2 * np.ones((self._num_bases, self._num_samples))

    if need_g:
        # Number of samples assigned to each cluster.
        num_i = np.zeros(self._num_bases)
        for i in range(self._num_bases):
            num_i[i] = len(np.where(assign == i)[0])

        self.G = np.zeros((self._num_samples, self._num_bases))
        self.G[sample_idx, assign] = 1.0
        self.G += 0.01
        # Divide each row by the size of the cluster its sample belongs to.
        self.G /= np.tile(np.reshape(num_i[assign], (-1, 1)),
                          self.G.shape[1])

    if not hasattr(self, 'W'):
        self.W = np.dot(self.data[:, :], self.G)
def create_splits(self, X):
    """Store, for every column of ``X``, the k-means centres with the lowest error.

    Each column is fitted 50 times with the same ``Kmeans`` object; the
    ``means`` of the first fit reaching the smallest ``error`` is appended to
    ``self.thresholds`` (one entry per column).
    """
    _, n_features = X.shape
    n_restarts = 50

    self.thresholds = []
    for col in range(n_features):
        # Turn the (n,) column into an (n, 1) matrix.
        column = X[:, col]
        column = np.reshape(column, [column.size, 1])

        # NOTE(review): ``k`` is not defined in this method; it is resolved
        # from an enclosing or module scope - confirm it exists.
        clusterer = Kmeans(k=k)

        best_error, best_means = np.inf, None
        for _ in range(n_restarts):
            clusterer.fit(column)
            current_error = clusterer.error(column)
            if current_error < best_error:
                best_error, best_means = current_error, clusterer.means

        self.thresholds.append(best_means)
def select_next(iterval):
    """Return the index of the next data sample to select.

    Without the robust map this is simply the argmax of ``iterval``. With it,
    the ``_robust_nselect`` highest-scoring samples are recorded in
    ``self.sub`` and clustered, and the candidate with the smallest ``pdist``
    value to the centre of the most populated cluster is returned.
    """
    if not self._robust_map:
        return np.argmax(iterval)

    # Candidate indices, best score first.
    ranked = np.argsort(iterval)[::-1]
    d_sub = self.data[:, ranked[:self._robust_nselect]]
    self.sub.extend(ranked[:self._robust_nselect])

    # Cluster the candidates.
    clusterer = Kmeans(d_sub, num_bases=self._robust_cluster)
    clusterer.factorize(niter=10)

    # Find the cluster with the most members.
    bin_edges = range(self._robust_cluster + 1)
    counts = np.histogram(clusterer.assigned, bin_edges)[0]
    biggest = np.argmax(counts)

    distances = pdist(clusterer.W[:, biggest:biggest + 1], d_sub)
    return ranked[np.argmin(distances)]
def main():
    """Load the data and run the K search for K from 2 to 12."""
    # Step 1: read the data.
    frame = getDF()

    # Step 2: test for the best K; the first clearly visible elbow marks it.
    searcher = Kmeans()
    searcher.searchK(SAVAPATH, frame, 2, 12)
    # Afterwards, inspect the saved figure and pick the best K.
def quantize(self, img):
    """
    Quantizes an image into 2^b clusters

    Parameters
    ----------
    img : a (H,W,3) numpy array

    Returns
    -------
    quantized_img : a (H,W,1) numpy array containing cluster indices

    Stores
    ------
    colours : a (2^b, 3) numpy array, each row is a colour
    """
    height, width, _ = img.shape

    # One row per pixel, one column per colour channel.
    flat_pixels = img.reshape((-1, 3))

    clusterer = Kmeans(2**self.b)
    clusterer.fit(flat_pixels)

    labels = clusterer.predict(flat_pixels)
    quantized_img = labels.reshape((height, width, 1))

    self.colours = clusterer.means
    return quantized_img
def main():
    """Pick the best of several k-means runs and use it to initialise EM.

    K-means is run ``NB_INITIALIZATION_RETRIES`` times on the training data;
    the distortion of every run is plotted, the run with the lowest
    distortion is plotted and then passed to ``run_em_model`` twice (isotropic
    and general covariance).
    """
    # Reading the training data
    path_train = './data/EMGaussian.data'
    path_test = './data/EMGaussian.test'
    data = dp.parse_data_wo_labels(path_train, 2, delimiter=' ')
    data_test = dp.parse_data_wo_labels(path_test, 2, delimiter=' ')

    # Initialization with K-means
    best_kmean_model = None
    min_distortion = float("inf")
    distortions = []
    # ``range`` instead of the Python-2-only ``xrange`` so the script also
    # runs under Python 3.
    for _ in range(NB_INITIALIZATION_RETRIES):
        kmean_model = Kmeans(data, NB_CLUSTERS, MAX_K_MEAN_ITER)
        kmean_model.run()
        distortions.append(kmean_model.distortion)
        if kmean_model.distortion < min_distortion:
            best_kmean_model = kmean_model
            min_distortion = kmean_model.distortion

    # Showing the distortions
    plt.plot(range(1, NB_INITIALIZATION_RETRIES + 1), distortions)
    plt.xlabel("Initialization number")
    plt.ylabel("Distortion")
    plt.title("Running few Kmeans and measuring the distortion for each")
    plt.show()

    # Plotting the result
    best_kmean_model.plot()

    # Case where the covariance matrix is proportional to identity
    run_em_model(data, data_test, best_kmean_model, sigma_prop_identity=True)

    # General Case
    run_em_model(data, data_test, best_kmean_model, sigma_prop_identity=False)
def kmeansselect(self):
    """Cluster ``self.data`` into ``self._nsub`` groups and return ``dist.vq`` of it.

    The return value is whatever ``dist.vq(data, W)`` yields for the fitted
    model's data and centres; the original comment describes it as picking the
    data samples closest to the centres.
    """
    clusterer = Kmeans(self.data, num_bases=self._nsub)
    clusterer.initialization()
    clusterer.factorize()

    # Pick the data samples closest to the centres.
    return dist.vq(clusterer.data, clusterer.W)
def quantize(self, img):
    """Fit a ``2 ** self.b``-cluster k-means to the pixels of ``img``.

    The image and the fitted model are kept on the instance as ``self.img``
    and ``self.model``. Returns the model's ``means``.
    """
    n_clusters = 2 ** self.b
    rows, cols, depth = img.shape
    self.img = img

    # Flatten the first two axes so every row is one pixel.
    flat_pixels = np.reshape(img, (rows * cols, depth))

    clusterer = Kmeans(k=n_clusters)
    clusterer.fit(flat_pixels)
    self.model = clusterer

    return clusterer.means
def main():
    """Cluster two tab-separated datasets, print external indices and save plots.

    For both files, columns 2 onward are passed to ``Kmeans`` as the data and
    column 1 as its second argument (presumably the ground-truth labels, given
    the later use of ``ground_truth_clusters`` - confirm).
    """
    dataset1 = np.genfromtxt(r'../data/new_dataset_1.txt', dtype=float,
                             delimiter='\t')
    dataset2 = np.genfromtxt(r'../data/cho.txt', dtype=float, delimiter='\t')

    km1 = Kmeans(dataset1[:, 2:], dataset1[:, 1], 3)
    km2 = Kmeans(dataset2[:, 2:], dataset2[:, 1], 10)

    # The return values were never used, so the calls are kept without
    # binding them (presumably they configure the centroids - confirm).
    km1.initial_centroids(3, 5, 9)
    # Alternative: km1.initial_random_centroids(5)
    km2.initial_random_centroids(5)
    # Alternative: km1.centroids = km1.init_centroids = np.loadtxt(r'../log/cho_ground_centroids.txt')

    # specify iteration as parameter here
    km1.kmeans_algorithm()
    km2.kmeans_algorithm()

    extr_index_validation1 = ExternalIndex(km1.ground_truth_clusters,
                                           km1.clusters)
    extr_index_validation2 = ExternalIndex(km2.ground_truth_clusters,
                                           km2.clusters)

    print('Rand Index of dataset1 clusters :',
          extr_index_validation1.rand_index())
    print('Jaccard Coefficient of dataset1 clusters :',
          extr_index_validation1.jaccard_coefficient())
    print('Rand Index of dataset2 clusters :',
          extr_index_validation2.rand_index())
    print('Jaccard Coefficient of dataset2 dataset clusters :',
          extr_index_validation2.jaccard_coefficient())

    # Slice the arrays themselves: ``ndarray.data`` is the raw memory buffer,
    # which does not support 2-D slicing like ``[:, 2:]``.
    plot1 = Visualization(dataset1[:, 2:], km1.clusters, dataset1[:, 1])
    plot2 = Visualization(dataset2[:, 2:], km2.clusters, dataset2[:, 1])
    plot1.plot(r'../log/td1.jpg')
    plot2.plot(r'../log/cho2.jpg')

    # gene_cluster_matched = km1.cluster_validation()
    # print('Genes that matched in clusters: ', gene_cluster_matched)
def main():
    """Cluster a board from ``tc.init_board_gauss``, save the model and print it."""
    board = tc.init_board_gauss(nb_points, nb_classe, mini, maxi, ecart_min,
                                ecart_max)
    km = Kmeans(
        board,
        nb_cluster=nb_cluster,
        cpu=cpu,
        methode_dist=methode_dist,
        adr=img_dir,
    )
    km.run_global(choose_nb_graph=True, grphq=True)
    km.save(km_path)

    summary = "\n{}".format(km)
    print(summary)
def _init_kmeans(self):
    """Initialise from k-means.

    Builds ``Kmeans(self.data, self.k)`` and returns the pair
    ``(cluster.T, label)`` taken from it. Per the original author, the
    k-means itself is randomly initialised, which "is a really bad idea".
    """
    # Estimate the means of the mixture components using k-means.
    clustering = Kmeans(self.data, self.k)

    component_means = clustering.cluster.T
    return component_means, clustering.label
def quantize(self, X):
    """Cluster the last-axis vectors of a 3-D array into ``2 ** self.b`` groups.

    ``X`` is unpacked as ``(N, D, C)`` and flattened to ``(N * D, C)`` before
    fitting. Nothing is returned; the results are stored on the instance:

    * ``self.means`` - the fitted model's ``means``
    * ``self.y``     - the predicted cluster per element, reshaped to (N, D)
    * ``self.X``     - the input array
    """
    N, D, C = X.shape
    X_reshaped = np.reshape(X, (N * D, C))
    print(X_reshaped)

    model = Kmeans(np.power(2, self.b))
    model.fit(X_reshaped)

    # Predict once; the earlier code ran an extra predict() and threw the
    # result away.
    y = np.reshape(model.predict(X_reshaped), (N, D))

    self.means = model.means
    self.y = y
    self.X = X
def main(_argv):
    """Load the spiral samples, cluster them and show the progress animation."""
    loaded = wczytaj_baze_probek_z_tekstem('spirala.txt', 'spirala-type.txt')
    samples_text, _attr_names, _attr_is_symbolic = loaded

    samples = probki_str_na_liczby(samples_text, (0, 1))

    # The returned groups and centres are not used here.
    # NOTE(review): presumably Kmeans fills the module-level ``progress`` that
    # the animation reads - confirm.
    _groups, _centres = Kmeans(samples, FLAGS.groups, FLAGS.iterations,
                               progress)

    figure = plt.figure(1)
    # Keep the animation bound to a name for as long as the chart is shown.
    anim = animation.FuncAnimation(figure,
                                   Animate,
                                   frames=len(progress),
                                   repeat=False,
                                   interval=500)
    chart.show()
def create_splits(self, X):
    """Derive split thresholds from a 3-means clustering of every feature.

    Each column of ``X`` is clustered on its own; the centres of all columns
    are pooled and the sorted unique values are stored in
    ``self.thresholds``.

    Fixes three defects of the previous version: every centre of a feature was
    written to the same slot ``splits[d]``; the thresholds were built from
    ``self.means`` instead of the collected splits; and the feature was passed
    as a 1-D vector (transposing a 1-D array is a no-op).
    """
    model = Kmeans(3)
    N, D = X.shape
    k = model.k

    splits = np.empty((D * k, ))
    for d in range(D):
        # (N,) -> (N, 1), so the feature is a one-column data matrix.
        feature = np.asarray(X[:, d]).reshape(-1, 1)
        model.fit(feature)
        # Feature d owns the slots [d*k, (d+1)*k).
        splits[d * k:(d + 1) * k] = np.asarray(model.means).ravel()

    self.thresholds = np.unique(splits)
def main():
    """Cluster the matrix built from ``bdd.date_dir(path)``, save the model and print it."""
    frame = bdd.date_dir(path)
    idx, mtx = bdd.df2np(frame)
    # Only the index and matrix are needed from here on.
    del frame

    km = Kmeans(
        mtx,
        nb_cluster=nb_cluster,
        cpu=cpu,
        methode_dist=methode_dist,
        adr=img_dir,
        index=idx,
    )
    km.run_global(grphq=True, choose_nb_graph=True)
    km.save(km_path)

    summary = "\n{}".format(km)
    print(summary)
def quantize(self, img):
    """
    Quantizes an image into 2^b clusters

    Parameters
    ----------
    img : a (H,W,3) numpy array

    Returns
    -------
    quantized_img : a (H,W) numpy array containing cluster indices
        (uint8 while 2^b <= 256, otherwise a wider integer type so that
        indices cannot wrap around)

    Stores
    ------
    colours : a (2^b, 3) uint8 numpy array, each row is a colour
    """
    H, W, D = img.shape
    n_colours = 2**self.b

    model = Kmeans(k=n_colours)
    X = np.reshape(img, (H * W, 3))
    model.fit(X)
    y = model.predict(X)
    print(y.shape)

    # Copy the cluster centres into a uint8 palette in one assignment.
    self.colours = np.zeros((n_colours, 3), dtype='uint8')
    self.colours[...] = np.asarray(model.means)[:n_colours, :]

    # uint8 can only hold indices 0..255; widen when there are more clusters.
    index_dtype = np.uint8 if n_colours <= 256 else np.intp
    quantized_img = np.asarray(y).astype(index_dtype).reshape(H, W)
    return quantized_img
def create_splits(self, X):
    """Store the squeezed 10-means centres of every column of ``X``.

    ``self.thresholds`` becomes a list with one entry per column. Per the
    original comment, k = 10 was chosen with the elbow method.
    """
    n_rows, n_cols = X.shape

    def column_centres(col):
        """Fit a fresh model to one column and return its squeezed means."""
        clusterer = Kmeans(k=10)
        # All values of this feature, as an (n_rows, 1) matrix.
        column = X[:, col].reshape(n_rows, 1)
        clusterer.fit(column)
        return np.squeeze(clusterer.means)

    self.thresholds = [column_centres(col) for col in range(n_cols)]
def quantize_image(self, img):
    """Cluster the pixels of ``img`` into ``2 ** self.b`` groups.

    The fitted model's ``means`` are stored in ``self.means``; the predicted
    cluster label of every pixel (in flattened order) is printed and returned.
    """
    rows, cols, depth = tuple(img.shape)

    # One row per pixel.
    flat_pixels = np.reshape(img, (rows * cols, depth))

    clusterer = Kmeans(k=(2**self.b))
    clusterer.fit(flat_pixels)
    assignments = clusterer.predict(flat_pixels)

    self.means = getattr(clusterer, "means")

    print("Cluster Assignments")
    print(assignments)
    return assignments
def main():
    """Build the matrix from ``path``, cluster it, print per-cluster words and save."""
    # Chain the three frame-preparation steps.
    frame = bdd.bdd2bow(bdd.drop_profile(bdd.concat_dir(path), drop_var))

    idx, mtx = bdd.df2np(frame)
    col = frame.columns.values.astype(str)
    # Only the index, matrix and column names are needed from here on.
    del frame

    km = Kmeans(
        mtx,
        nb_cluster=nb_cluster,
        cpu=cpu,
        methode_dist=methode_dist,
        adr=img_dir,
        index=idx,
    )
    km.run_global(choose_nb_graph=True)
    bdd.print_means_words(km, col)
    km.save(km_path)

    summary = "\n{}".format(km)
    print(summary)
def main():
    """Build and save a model from the Spotify playlist unless one already exists.

    When ``leerModelo()`` returns ``None`` the playlist is loaded, wrapped in
    a ``Kmeans`` object and, if it contains any rows, the data is imported;
    the model is saved only when it ends up with a non-``None`` ``red``.
    """
    # Identity checks: ``== None`` may be hijacked by an overridden __eq__.
    if leerModelo() is not None:
        print('Ya existe un modelo')
        return

    print('NO EXISTE UN MODELO')
    spotify = SpotifyPro()
    df = spotify.iniciar(idPlaylist='1QP6tyANnZZ9bRTfQG4X7a')
    k = Kmeans(df)  # holds the network and the datasets

    # ``len(df)`` rather than plain truthiness: ``df`` may be a DataFrame,
    # whose truth value is ambiguous - confirm its type.
    if len(df):
        k.importarDatos()
        if k.red is not None:
            guardarModelo(k)
def dequantize_image(self, img):
    """Return ``img`` with every pixel replaced by its cluster centre.

    A ``2 ** self.b``-cluster k-means is fitted to the pixels of ``img``; the
    fitted ``means`` are stored in ``self.means`` and a float array of the
    same shape as ``img`` is returned, holding the centre of the cluster each
    pixel was assigned to.
    """
    w, h, d = img.shape
    resized_image = np.reshape(img, (w * h, d))

    model = Kmeans(k=(2**self.b))
    model.fit(resized_image)
    labels = model.predict(resized_image)
    self.means = model.means

    # One indexed gather replaces the former per-pixel Python double loop;
    # labels are in row-major pixel order, matching the reshape above.
    original_image = np.zeros(img.shape)
    original_image[...] = np.asarray(self.means)[np.asarray(labels)].reshape(
        w, h, d)
    return original_image
def closure_1_3_1():
    """Fit 4-means 50 times, plot the lowest-error clustering and save the figure.

    ``X`` is taken from the surrounding scope. The plot is written to
    ``../figs/kmeans_outliers_best_model.png``.
    """
    n_clusters = 4
    n_restarts = 50

    lowest_error = np.inf
    best_model = None
    for _ in range(n_restarts):
        candidate = Kmeans(n_clusters)
        candidate.fit(X)
        candidate_error = candidate.error(X)
        # Strict comparison: the first model reaching the minimum wins.
        if candidate_error < lowest_error:
            lowest_error, best_model = candidate_error, candidate

    plt.figure()
    assignments = best_model.predict(X)
    utils.plot_2dclustering(X, assignments)

    fname = os.path.join("..", "figs", "kmeans_outliers_best_model.png")
    plt.savefig(fname)
    print("\nFigure saved as '%s'" % fname)
def test_kmeans(self):
    """Print the hold-out agreement of this Kmeans and of sklearn's KMeans.

    The iris labels are encoded as 0 (setosa), 1 (versicolor) and 2
    (anything else); the data is split 67/33 with a fixed seed; both models
    are fitted on the training part and their predictions for the test part
    are compared with the encoded labels.

    NOTE(review): cluster ids are compared directly with class ids. K-means
    numbers its clusters arbitrarily, so the printed ratios are only
    meaningful if the ids happen to line up - consider matching clusters to
    classes first.
    """
    label_ids = {"Iris-setosa": 0, "Iris-versicolor": 1}
    # Every other label (i.e. "Iris-virginica") maps to 2.
    exact_labels = [label_ids.get(item, 2) for item in self.data["label"]]

    k = 3
    kmeans = Kmeans(k)
    X_train, X_test, y_train, y_test = train_test_split(self.features,
                                                        exact_labels,
                                                        test_size=0.33,
                                                        random_state=42)

    kmeans.load_data(X_train.to_numpy().tolist())
    kmeans.train()
    labels = kmeans.predict(X_test.to_numpy().tolist())

    accurate_sum = sum(1 for predicted, expected in zip(labels, y_test)
                       if predicted == expected)
    print("Akurasi K-Means: ", accurate_sum / len(labels))

    kmeans_sklearn = KMeans(n_clusters=3)
    kmeans_sklearn.fit(X_train)
    # ``labels_`` holds the assignments of the *training* rows; the test rows
    # have to be predicted explicitly before comparing with ``y_test``.
    sklearn_labels = kmeans_sklearn.predict(X_test)

    sklearn_accurate_sum = sum(
        1 for predicted, expected in zip(sklearn_labels, y_test)
        if predicted == expected)
    print("Akurasi K-Means sklearn: ", sklearn_accurate_sum / len(labels))
def _init_parameters(self, X, method='kmeans'):
    """Initialise the parameters of the Gaussian components.

    If ``method == 'kmeans'`` the samples are grouped with k-means;
    if ``method == 'random'`` ``self.m`` random samples are drawn as centres
    and every sample joins its nearest centre.

    Each component is then initialised from the samples of its group and its
    weight is set to the group's share of all samples.

    Raises
    ------
    ValueError
        If ``method`` is neither ``'kmeans'`` nor ``'random'``.
    """
    n = X.shape[0]
    self.Guass = [Guass_distribution(dim=self.dim) for _ in range(self.m)]

    # Compare by value: ``is`` on string literals only works by accident of
    # interning and triggers a SyntaxWarning on Python 3.8+.
    if method == 'kmeans':
        try:
            kmeans = Kmeans()
            labels, centroids = kmeans.main(X,
                                            k=self.m,
                                            t=100,
                                            c_strategy='kmeans')
        except Exception:
            # Fall back to SciPy if the in-house implementation fails.
            # ``Exception`` instead of a bare ``except`` lets
            # KeyboardInterrupt / SystemExit propagate.
            centroids, labels = vq.kmeans2(X,
                                           self.m,
                                           minit='points',
                                           iter=1000)
        clusters = [[j for j in range(n) if labels[j] == i]
                    for i in range(self.m)]
    elif method == 'random':
        time_seed = int(time.time())
        np.random.seed(time_seed)
        clusters = [[] for _ in range(self.m)]
        # Randomly pick m samples as centres.
        # NOTE(review): this uses ``random.sample`` while the seed above is
        # set on ``np.random`` - confirm which generator was meant.
        centroids = random.sample(list(range(n)), self.m)
        for i in range(n):
            ci = np.argmin([la.norm(X[i] - X[c]) for c in centroids])
            clusters[ci].append(i)
    else:
        raise ValueError("Unknown method type!")

    for i in range(self.m):
        guass = self.Guass[i]
        data = X[clusters[i]]
        guass.init(data)
        guass.weight = len(clusters[i]) / n
def sliding_window_three_months(df, date):
    """Cluster the three-month candlestick data for ``date`` into a ``candlestickState``.

    ``date`` must be a ``YYYY-MM-DD`` string. The clusters returned by
    ``Kmeans(...).get_clusters()`` are printed, then their first four
    components are gathered column-wise and passed to ``candlestickState``.
    """
    # The parsed value is not used; the call rejects malformed dates.
    datetime.datetime.strptime(date, "%Y-%m-%d")

    monthly_data = preprocess().get_three_monthly_candlestick_data(df, date)

    clusters = Kmeans(monthly_data).get_clusters()
    print('original', clusters)

    # Gather component 0..3 of every cluster, one list per component.
    rows = [clusters[i] for i in range(len(clusters))]
    columns = [[row[component] for row in rows] for component in range(4)]

    return candlestickState(*columns)
def main():
    """Build an (x, y, customer value) dataset and run 4-means on it.

    Customer value is the sum of the integer-truncated prices of all
    breakfast, lunch and dinner purchases of a client. Positions come from
    the lunch movement file.

    Files are now opened in ``with`` blocks so they are closed after reading.
    The former unconditional opens of ``points.json``, ``breakfast.json`` and
    ``dinner.json`` were dropped because those handles were never read.
    """
    # In our dataset, customers have ids 0 to 999. Adjust this if the dataset
    # changes its number of clients.
    customerValue = [0] * 1000

    def add_purchases(path):
        """Add every purchase price in the file at ``path`` to its client's total."""
        with open(path) as purchase_file:
            purchases = json.load(purchase_file)["data"]
        for purchase in purchases:
            customerValue[int(purchase["Client ID"])] += int(
                float(purchase["Price"]))

    for purchase_path in ('../data/breakfastbuy.json',
                          '../data/lunchbuy.json',
                          '../data/dinnerbuy.json'):
        add_purchases(purchase_path)

    print("Loading dataset...")
    # Change the path below to use a different dataset
    # ('../data/breakfast.json' or '../data/dinner.json').
    movement_path = '../data/lunch.json'
    with open(movement_path) as movement_file:
        movements = json.load(movement_file)["data"]

    # Data is in x,y. We want row,col.
    dataset = [[
        int(customer["X"]),
        -int(customer["Y"]) + 55,
        customerValue[int(customer["ID"])],
    ] for customer in movements]
    print("Done!")

    Kmeans(4, dataset)
def clusterpixels(infile, k, steps):
    """Cluster a ``steps`` x ``steps`` grid of image tiles by mean colour.

    The image is cut into tiles of ``shape // steps`` pixels; each tile is
    described by the mean of its first three channels. The tiles are
    clustered into ``k`` groups and the resulting label grid is resized back
    to the image's height and width with nearest-neighbour interpolation.
    """
    im = np.array(Image.open(infile))
    tile_h = int(im.shape[0] / steps)
    tile_w = int(im.shape[1] / steps)

    # Mean R, G, B of every tile, row by row.
    features = []
    for gx in range(steps):
        row_span = slice(gx * tile_h, (gx + 1) * tile_h)
        for gy in range(steps):
            col_span = slice(gy * tile_w, (gy + 1) * tile_w)
            features.append(
                [np.mean(im[row_span, col_span, ch]) for ch in range(3)])
    features = np.array(features, 'f')  # as a float32 matrix

    # Cluster; k is the number of clusters.
    centroids, variance, iternum = Kmeans(features, k)

    # Vector-quantise the tiles against the centroids to get their labels.
    code, distance = vq(features, centroids)

    # Arrange the labels as a grid and scale it up to the image size.
    label_grid = code.reshape(steps, steps)
    return imresize(label_grid, im.shape[:2], 'nearest')
def kmeans(imagens, segmentadas, path):
    """Segment every image with k = 2, 3 and 5 and lay the results out in a figure.

    For each entry of ``imagens`` the image at ``entry[0]`` is read and
    segmented three times; the scaled results are written to ``res2.png``,
    ``res3.png`` and ``res9.png`` (overwritten on every iteration) and added,
    with the original, to a new 1x4 matplotlib figure.

    ``segmentadas`` and ``path`` are accepted but not used here.

    Fixes the image index: it used to stay at 0, so the first image was
    processed once per entry instead of each image once.
    """
    k = Kmeans(imagens)

    for qtd, _ in enumerate(imagens):
        # Read the image
        img = imread(imagens[qtd][0])
        # img = cv2.resize(img, (segmentadas[qtd][2], segmentadas[qtd][1]))

        res2 = k.kmeans_seg(img, 2) / 255
        res3 = k.kmeans_seg(img, 3) / 255
        res9 = k.kmeans_seg(img, 5) / 255  # named res9 but uses k=5

        # NOTE(review): the arrays were divided by 255 before being written -
        # confirm the saved PNGs are not meant to be in the 0..255 range.
        cv2.imwrite("res2.png", res2)
        cv2.imwrite("res3.png", res3)
        cv2.imwrite("res9.png", res9)

        fig = plt.figure(figsize=(9, 3), dpi=200)
        k.add_image(fig, img, 1, 4, 1, 'original')
        k.add_image(fig, res2, 1, 4, 2, 'k=2')
        k.add_image(fig, res3, 1, 4, 3, 'k=3')
        k.add_image(fig, res9, 1, 4, 4, 'k=5')