def init_with_kmeans(self):
    '''Initialise BGD_GMM and FGD_GMM (the background and foreground
    models respectively) using the k-means algorithm.'''
    max_iter = 2  # maximum iteration count for k-means
    # np.logical_or is required in the two indexings below; Python's `or`
    # does not broadcast over boolean arrays.
    # Pixels that may belong to the background...
    self._bgd = np.where(np.logical_or(self._mask == self._GC_BGD,
                                       self._mask == self._GC_PR_BGD))
    # ...and pixels that may belong to the foreground.
    self._fgd = np.where(np.logical_or(self._mask == self._GC_FGD,
                                       self._mask == self._GC_PR_FGD))
    self._BGDpixels = self.img[self._bgd]
    self._FGDpixels = self.img[self._fgd]
    KMB = kmeans(self._BGDpixels, dim=3, n=self.k, max_iter=max_iter)  # background model
    KMF = kmeans(self._FGDpixels, dim=3, n=self.k, max_iter=max_iter)  # foreground model
    KMB.run()
    KMF.run()
    self._BGD_by_components = KMB.output()
    self._FGD_by_components = KMF.output()
    self.BGD_GMM = GMM()  # the GMM for the background
    self.FGD_GMM = GMM()  # the GMM for the foreground
    # Feed each pixel into the k-means component of the matching GMM.
    for ci in range(self.k):
        for pixel in self._BGD_by_components[ci]:
            self.BGD_GMM.add_pixel(pixel, ci)
        for pixel in self._FGD_by_components[ci]:
            self.FGD_GMM.add_pixel(pixel, ci)
    self.BGD_GMM.learning()
    self.FGD_GMM.learning()
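# A minimal, self-contained sketch of the same initialisation idea, using
# scikit-learn's KMeans in place of the project's own kmeans class (an
# assumption made purely for illustration): cluster the pixels, then fit
# one Gaussian (weight, mean, full covariance) per component.
import numpy as np
from sklearn.cluster import KMeans

def init_gmm_params(pixels, k=5):
    """pixels: (N, 3) float array of colour values."""
    labels = KMeans(n_clusters=k, n_init=10).fit_predict(pixels)
    weights, means, covs = [], [], []
    for ci in range(k):
        comp = pixels[labels == ci]
        weights.append(len(comp) / len(pixels))
        means.append(comp.mean(axis=0))
        covs.append(np.cov(comp, rowvar=False))  # rows are observations
    return np.array(weights), np.array(means), np.array(covs)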
n = index
print("K-means for " + str(n) + " centroids.")
center = None
label = None
ret = float('inf')  # best (lowest) distortion seen so far
for _ in range(1):
    # Draw n distinct feature vectors as the initial centroids.
    centroids = features[np.random.choice(range(len(features)), n, replace=False)]
    c_ret, c_label, c_center = kmeans(features, k=n,
                                      centroids=centroids, steps=100)
    if c_ret < ret:
        ret = c_ret
        label = c_label
        center = c_center
print(ret)
rets.append(ret)
# Save the best run, not the last restart's.
save_clustering("features/hog_cluster.bin", label, center)
# plt.plot(rets)
# plt.show()
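# The keep-the-best pattern above generalised to several restarts (a
# sketch; it assumes the same kmeans(features, k, centroids, steps)
# signature returning (distortion, labels, centers) as the call above).
import numpy as np

def best_of_restarts(features, k, restarts=5, steps=100):
    best = (float('inf'), None, None)
    for _ in range(restarts):
        seeds = features[np.random.choice(len(features), k, replace=False)]
        result = kmeans(features, k=k, centroids=seeds, steps=steps)
        if result[0] < best[0]:  # lower distortion wins
            best = result
    return best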
pid_list, command_list, param_list = [], [], []
index_0, index_1 = [], []
print('node: ', nodes[i][0])
for param in params[i]:
    if param[2] is not None:
        pid_list.append(param[0])
        command_list.append(param[1])
        param_list.append(param[2])
# Stack the 1-D parameter list twice to get a 2-D data set; note that both
# columns are identical after the transpose.
dataset = np.vstack((param_list, param_list))
dataset = np.transpose(dataset)

# k-means clustering
k = 2
dataset = np.mat(dataset)
if len(dataset) > 0:
    centroids, cluster_assment = km.kmeans(dataset, k)
    for index, value in enumerate(cluster_assment[:, 0]):
        if value == 0:
            index_0.append(index)
        if value == 1:
            index_1.append(index)
    print('kmeans: ', param_i)
    # Report the commands in the smaller of the two clusters.
    if len(index_0) <= len(index_1):
        print('command: ', [command_list[j] for j in index_0])
        # print('pid: ', [pid_list[j] for j in index_0])
    else:
        print('command: ', [command_list[j] for j in index_1])
        # print('pid: ', [pid_list[j] for j in index_1])
    print()
# 2d plot
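# The same minority-cluster report, vectorised (a sketch; it assumes the
# first column of cluster_assment holds the cluster index, as the loop
# above does).
import numpy as np

assign = np.asarray(cluster_assment[:, 0]).ravel()
minority = int(np.sum(assign == 0) > np.sum(assign == 1))  # index of the smaller cluster
print('command: ', [command_list[j] for j in np.where(assign == minority)[0]])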
def k_means_RBFS(training_samples, N):
    '''Place N RBF centres with k-means and choose a common spread via the
    heuristic sigma = 0.1 * (max - min) / sqrt(2N), where max - min is the
    range of the returned centres.'''
    rbfs = kmeans(training_samples, N)
    sigma = 0.1 * (rbfs.max() - rbfs.min()) / np.sqrt(2 * N)
    return rbfs, sigma
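# Usage sketch: turn the centres and sigma returned above into a Gaussian
# RBF design matrix (rbf_design_matrix is a hypothetical helper, not part
# of the original code).
import numpy as np

def rbf_design_matrix(samples, centres, sigma):
    # phi[i, j] = exp(-||x_i - c_j||^2 / (2 * sigma^2))
    d2 = ((samples[:, None, :] - centres[None, :, :]) ** 2).sum(axis=2)
    return np.exp(-d2 / (2.0 * sigma ** 2))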
# coding=utf-8
import sys
import os
import time

import numpy as np
from sklearn import preprocessing

root_path = os.getcwd()  # root directory of the current script
sys.path.append(root_path + '/Other')  # make the Other folder importable
from OnePassCluster import *
import k_means

if __name__ == '__main__':
    vectors = np.loadtxt('Other/dataSet/dim1024.txt')  # k=16, threshold=0.1927
    # vectors = np.loadtxt('Other/dataSet/g2-256-100.txt')  # 0.192717136469125
    vectors = preprocessing.normalize(vectors)  # normalisation
    # print(vectors)
    k = 16
    t1 = time.time()
    cluster_result = np.array(k_means.kmeans(k=k, vectors=vectors))
    t2 = time.time()
    # time.time() is already in seconds, so no division by 1000 is needed.
    print("k-means spend time %.9fs" % (t2 - t1))
    print(cluster_result)
    for i in range(k):
        print(i, np.where(cluster_result == i)[0])
    print('-------')
    o_p_c = OnePassCluster(t=0.1927, vector_list=vectors)
    o_p_c.print_result()
def _parse_address(self):
    '''Convert address to binary arrays.'''
    data = []
    print("Parsing address.")
    print(len(np.unique(self.data[:, 6])))
    # Strip a leading house number from each address, if present.
    for index in range(len(self.data[:, 6])):
        splitted_address = self.data[:, 6][index].split(' ', 1)
        if self._is_int(splitted_address[0]):
            data.append(splitted_address[1])
        else:
            data.append(self.data[:, 6][index])
        self.data[index][6] = data[index]
    labels = np.unique(data)
    data = np.array(data)
    # Mean coordinates per unique address.
    means_address = []
    for index_label in range(len(labels)):
        address_data = self.data[np.where(data == labels[index_label])]
        means_address.append([np.mean(address_data[:, 7]),
                              np.mean(address_data[:, 8])])
    means_address = np.float32(np.array(means_address))
    # Hand-picked seed centroids from three coordinate bands.
    cond_1 = means_address[np.where(means_address[:, 1] > 85)]
    centroid1 = cond_1[0]
    cond_2 = means_address[np.where(np.logical_and(means_address[:, 1] > 68,
                                                   means_address[:, 1] < 80))]
    centroid2 = cond_2[0]
    cond_3 = means_address[np.where(np.logical_and(means_address[:, 1] > 50,
                                                   means_address[:, 1] < 70))]
    centroid3 = cond_3[0]
    total = len(means_address)
    # centroid1 = means_address[np.random.randint(total)]
    # centroid2 = means_address[np.random.randint(total)]
    # centroid3 = means_address[np.random.randint(total)]
    rets = []
    # for n in range(10, 11):
    for n in range(100, 101):
        center = None
        label = None
        ret = float('inf')
        for _ in range(5):
            centroids = []
            if n == 2:
                centroids.append(centroid2)
                centroids.append(means_address[np.random.randint(total)])
                centroids = np.float32(centroids)
            elif n == 3:
                centroids.append(centroid1)
                centroids.append(centroid2)
                centroids.append(means_address[np.random.randint(total)])
                centroids = np.float32(centroids)
            elif n == 4:
                centroids.append(centroid1)
                centroids.append(centroid2)
                centroids.append(centroid3)
                centroids.append(means_address[np.random.randint(total)])
                centroids = np.float32(centroids)  # conversion was missing in this branch
            else:
                centroids.append(centroid1)
                centroids.append(centroid2)
                centroids.append(centroid3)
                centroids = np.float32(centroids)
                centroids = np.vstack((centroids,
                                       means_address[np.random.choice(range(len(means_address)), n - 3)]))
            # Top up with random points until there are n unique centroid
            # rows (np.unique needs axis=0 here; without it the 2-D array
            # is flattened to unique scalars).
            while len(np.unique(centroids, axis=0)) < n:
                centroids = np.unique(centroids, axis=0)
                x = np.random.rand() * np.mean(centroids[:, 0])
                y = np.random.rand() * np.mean(centroids[:, 1])
                centroids = np.vstack((centroids, np.float32([x, y])))
            c_ret, c_label, c_center = kmeans(means_address, k=n,
                                              centroids=centroids, steps=1000)
            if c_ret < ret:
                ret = c_ret
                label = c_label
                center = c_center
        print(ret)
        rets.append(ret)
    # plt.plot(rets)
    # plt.show()
    # Plot the data:
    # plt.scatter(means_address[:, 0], means_address[:, 1])
    # plt.scatter(center[:, 0], center[:, 1], s=80, c='y', marker='s')
    # plt.xlabel('Height'), plt.ylabel('Weight')
    # plt.show()
    # Rename each address to its cluster id, then binarise.
    for index_label in range(len(labels)):
        data[np.where(data == labels[index_label])] = "A" + str(label[index_label])
    labels = np.unique(data)
    data = self._binarize_feature(data, labels)
    del means_address
    del splitted_address
    del address_data
    del rets
    del center
    del label
    del c_label
    del c_center
    del centroids
    return labels, data
from numpy import genfromtxt
from sklearn import svm, metrics
from sklearn.neighbors import KNeighborsClassifier
import scipy.io
from k_means import kmeans

data = scipy.io.loadmat('dataku.mat')["dataimage_plus"]
# print(data)
# data = genfromtxt('dataimage.csv', delimiter=',')
n_samples = len(data)

# Split into inputs and outputs.
# Columns 2:4 hold the FFT features (mean and standard deviation of the FFT);
# columns 0:2 hold the GLCM features (entropy and energy).
data_i = data[:, 0:2]
data_o, c = kmeans(data_i, 3)
print(len(data_o))
data_expected = data[:, 4] - 1
print(len(data_expected))

# Split 50% of the data for training...
# data_i_train = data_i[0:][::2]
# data_o_train = data_o[0:][::2]
# ...and 50% for testing.
# data_i_test = data_i[1:][::2]
# data_o_test = data_o[1:][::2]
# Create the k-NN classifier.
# neigh = KNeighborsClassifier(n_neighbors=3)
raw_data = mat73.loadmat('./data/kmeans_pts.mat')
gr_list = [t[0] for t in raw_data['Data']['gr_pts']]
n_itrs = 20

# TODO: finish this and run by tomorrow morning.
cluster_purities = pd.DataFrame(
    columns=['Algorithm', 'NumberClusters', 'ClusterPurity'])
for k in range(5, 30, 5):
    print('.')
    print('.')
    print('.')
    for exp in range(10):
        # for exp in range(2):
        centers = km.kmeans(gr_list, k, n_itrs, 'flag')
        cluster_purity = km.clusterPurity(labels_true, gr_list, centers, 'flag')
        cluster_purities = cluster_purities.append(
            {
                'Algorithm': 'Flag Mean',
                'NumberClusters': k,
                'ClusterPurity': cluster_purity
            },
            ignore_index=True)
        print("Flag trial " + str(exp + 1) + " finished")
        print('.')
        centers = km.kmeans(gr_list, k, n_itrs, 'sine')
        cluster_purity = km.clusterPurity(labels_true, gr_list, centers, 'sine')
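# Note: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
# An equivalent one-row concat, should the loop above need porting (a
# sketch with a hypothetical helper name):
import pandas as pd

def append_row(df, row_dict):
    return pd.concat([df, pd.DataFrame([row_dict])], ignore_index=True)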
# pl.hist(useful_values, 50, normed=1, facecolor='green', alpha=0.75)
# pl.show()
time_mat_PE1 = deepcopy(time_mat)
time_mat_PE2 = deepcopy(time_mat)
time_mat_PE3 = deepcopy(time_mat)
# Clamp values to the range [0, 500].
dim_max = np.shape(time_mat)
for i1 in range(0, dim_max[0]):
    for j1 in range(0, dim_max[1]):
        temp = double(time_mat[i1, j1] / 500)
        if temp > 1:
            time_mat_PE2[i1, j1] = 500
        if temp < 0:
            time_mat_PE2[i1, j1] = 0

centroids, clusterAssment = kmeans(time_mat, 5)
print(clusterAssment)
print(np.shape(clusterAssment))
# showCluster(time_mat, 5, centroids, clusterAssment)

######################################## POINT #####
first_line = ['pointId', 'lon', 'lat', 'alt',
              'valueOfTime1', 'valueOfTime2', 'valueOfTime3', 'valueOfTime4',
              'valueOfTime5', 'valueOfTime6', 'valueOfTime7', 'valueOfTime8',
              'valueOfTime9', 'valueOfTime10', 'valueOfTime11', 'valueOfTime12',
              'valueOfTime13']
outfp_o_cl1.writerow(first_line)
outfp_o_cl2.writerow(first_line)
outfp_o_cl3.writerow(first_line)
outfp_o_cl4.writerow(first_line)
outfp_o_cl5.writerow(first_line)
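# The element-wise clamp above, vectorised (a sketch; equivalent in effect
# if time_mat is a NumPy array: values above 500 become 500, negative
# values become 0).
time_mat_PE2 = np.clip(time_mat, 0, 500)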
print("=================================================================") ''' K-Means Algorithm ''' print("\n\nK-means Clustering") k = input("Enter K: ") max_iterations = input("Enter Maximum iterations: ") min_df = input("Enter Minimum document Frequency: ") print("=================================================================") k = int(k) max_iterations = int(max_iterations) min_df = int(min_df) start = time.time() Kmeans = k_means.kmeans() # Min Document Frequency is used as Feature Selcection Parameter y_pred, labels = Kmeans.clustering(X, k, max_iterations, min_df) # Vectors and Features of X in K-means Clustering ''' k_means_vector = Kmeans.vectors k_means_features = Kmeans.features ''' contingency_matrix = metrics.cluster.contingency_matrix(y, y_pred) score = purity_score(y, y_pred) print("Purity: " + str(score)) end = time.time() tot = end - start
from numpy import genfromtxt
from sklearn import svm, metrics
from sklearn.neighbors import KNeighborsClassifier
import scipy.io
from k_means import kmeans

data = scipy.io.loadmat('dataku.mat')["dataimage_plus"]
# print(data)
# data = genfromtxt('dataimage.csv', delimiter=',')
n_samples = len(data)

# Split into inputs and outputs.
# Columns 2:4 hold the FFT features (mean and standard deviation of the FFT);
# columns 0:2 hold the GLCM features (entropy and energy).
data_i = data[:, 0:2]
data_o, c = kmeans(data_i, 3)
# print(data_o)
data_exp = data[:, 4]

# Split 50% of the data for training...
data_i_train = data_i[0:][::2]
data_o_train = data_o[0:][::2]
# ...and 50% for testing.
data_i_test = data_i[1:][::2]
data_o_test = data_exp[1:][::2] - 1

# Create the k-NN classifier.
neigh = KNeighborsClassifier(n_neighbors=3)
# We learn the digits on the first half of the digits
# coding=utf-8
'''
Author: ripples
Email: [email protected]
date: 2020/3/11 15:34
desc:
'''
# Could add a comparison of clustering on shuffled vs. unshuffled data.
import sys
import copy

sys.path.append('../')
import numpy as np
import matplotlib.pyplot as plt
from k_means import kmeans

path = '../iris/iris.data'
x = kmeans(path, 3)
x.cal()
# x.plot_label_true()
import scipy.io as sio
import numpy as np
import mat73
import center_algorithms as ca
import matplotlib.pyplot as plt
import k_means as km
import seaborn as sns
import pandas as pd

labels_raw = sio.loadmat(
    './data/kmeans_action_labels.mat')['kmeans_action_labels']
labels_true = [l[0][0] for l in labels_raw['labels'][0][0]]
# labelidxs = labels_raw['labelidxs'][0][0][0]

raw_data = mat73.loadmat('./data/kmeans_pts.mat')
gr_list = [t[0] for t in raw_data['Data']['gr_pts']]

n_itrs = 20
k = 15
centers = km.kmeans(gr_list, k, n_itrs, 'flag')
cluster_purity = km.clusterPurity(labels_true, gr_list, centers, 'flag')
import numpy as np
import matplotlib.pyplot as plt
from k_means import kmeans

# input data
X = np.array([
    [4, 4], [3, 3], [5, 3], [2, 3], [5, 5],
    [3, 2], [2, 4], [4, 5], [5, 4], [2, 2]])

# run the clustering
ans = kmeans(2, X)

# display the results
print(ans)
plt.plot(X[:, 0], X[:, 1], 'bx', ans[:, 0], ans[:, 1], 'r*', markersize=20)
plt.grid()
plt.show()
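# For comparison, the same data through SciPy's reference implementation
# (an assumption for illustration; note that SciPy's argument order
# kmeans(obs, k) differs from the local kmeans(k, X) used above).
from scipy.cluster.vq import kmeans as scipy_kmeans

centres, distortion = scipy_kmeans(X.astype(float), 2)
print(centres, distortion)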
output = lda.final_output
with open("cluster_images/top10topicswith10wordswithoutprobs.csv", "a") as top:
    top.write("num_topics=" + str(num_topics) + "_no_above=" +
              str(no_above).replace(".", "") + ",")
    topics = lda.get_topics(num_words=10, probs=False)
    for topic_words in topics:
        top.write(str(topic_words) + ",")
    top.write("\n")

with open("centroids.csv", "a") as top:
    top.write(",")
    for i in range(1, 11):
        top.write("cluster" + str(i) + ",")
    top.write("\n")

for k in [8]:
    k_means = kmeans(output, "histograms")
    labels, score = k_means.cluster(k)
    with open("centroids.csv", "a") as top:
        top.write("num_topics=" + str(num_topics) + "_no_above=" +
                  str(no_above).replace(".", "") + "_k=" + str(k) + ",")
        centroids = k_means.centroids
        for center in centroids:
            for counter, topics in enumerate(center):
                top.write(str(counter + 1) + "= " + str(topics) + " ")
            top.write(",")
        # top.write("\n")
    k_means.plot_histogram2(
        "num_topics=" + str(num_topics) + "_no_above=" +
""" Created on Tue Oct 10 11:40:16 2017 @author: xuwh """ from numpy import * import time import matplotlib.pyplot as plt import types import k_means ## step 1: load data print("step 1: load data...") dataSet = [] fileIn = open(r'D:\Python\MachineLearningInAction\testSet.txt') for line in fileIn.readlines(): lineArr = line.strip().split() #print lineArr[0],lineArr[1] #dataSet.append([float(lineArr[0]),float(lineArr[1])]) dataSet.append([float(lineArr[0])]) print("step 2: clustering...") #这里使用mat将dataSet数据转换为矩阵之后才能进行线性代数操作 dataSet = mat(dataSet) k = 4 centroids, clusterAssment = k_means.kmeans(dataSet, k) # step 3: show the result print("step 3: show the result...") #plt.plot() k_means.showCluster(dataSet, k, centroids, clusterAssment)
""" Created on Sat Oct 25 20:34:32 2014 @author: Imane """ import numpy as np import matplotlib.pyplot as plt #from os import listdir #from os.path import isfile, join #from zscoring import zscoringNII #from masking import maskdata from sklearn.decomposition import PCA from k_means import kmeans #Applying PCA and plotting fn = "dataZM\dataMask2.npy" d=np.load(fn) pca = PCA(n_components=2) pca.fit(d) dpca=pca.transform(d) plt.scatter(dpca[:,0], dpca[:,1], marker='o', color='b') #Applying kmeans and plotting idx, ctrs = kmeans(dpca, 2) plt.scatter(dpca[(idx==0),0], dpca[(idx==0),1], marker='o', color='r') plt.scatter(dpca[(idx==1),0], dpca[(idx==1),1], marker='o', color='b') plt.scatter(ctrs[:,0], ctrs[:,1], marker='o', color='k', linewidths=5)