def compute_log_inertia(X, n_clusters, T, bb_min, bb_max, random_state=0):
    """Compute the log inertia of X and of the random reference draws X_t.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        List of n_features-dimensional data points. Each row corresponds
        to a single data point.
    n_clusters : int
        The desired number of clusters.
    T : int
        Number of samples in each random draw X_t (100 draws are made).
    bb_min : array, shape (n_features,)
        Inferior corner of the bounding box of X.
    bb_max : array, shape (n_features,)
        Superior corner of the bounding box of X.
    random_state : int, defaults to 0
        Seed for the NumPy random number generator.

    Returns
    -------
    log_inertia : float
        Log of the inertia of the K-means applied to X.
    mean_log_inertia_rand : float
        Mean of the log of the inertia of the K-means applied to the
        different X_t.
    std_log_inertia_rand : float
        Standard deviation of the log of the inertia of the K-means
        applied to the different X_t.
    """
    nb_experiences = 100
    log_inertia = np.log(kmeans(X, n_clusters, show=False)[2])
    experiences = []
    np.random.seed(random_state)
    for _ in range(nb_experiences):
        # Draw T points uniformly inside the bounding box of X.
        Xt = np.random.uniform(bb_min, bb_max, size=(T, len(bb_min)))
        experiences.append(np.log(kmeans(Xt, n_clusters, show=False)[2]))
    mean_log_inertia_rand = np.mean(experiences)
    std_log_inertia_rand = np.std(experiences)
    return log_inertia, mean_log_inertia_rand, std_log_inertia_rand
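# A minimal usage sketch for compute_log_inertia (an assumption, not part of
# the original module): it presumes the surrounding kmeans(X, n_clusters,
# show=False) returns a tuple whose third element is the inertia, as the [2]
# indexing above suggests. The gap between the reference and observed
# log-inertia is printed for a range of cluster counts, in the spirit of the
# gap statistic.
import numpy as np

X = np.random.rand(300, 2)
bb_min, bb_max = X.min(axis=0), X.max(axis=0)
for k in range(2, 7):
    log_in, mean_rand, std_rand = compute_log_inertia(
        X, k, T=X.shape[0], bb_min=bb_min, bb_max=bb_max)
    print("k=%d: gap=%.3f (+/- %.3f)" % (k, mean_rand - log_in, std_rand))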
def test_kmeans(self):
    X = create_clusters([(20, 30), (20, 60), (30, 45), (40, 60)], 30, 8)
    dist_label, centroids = kmeans(X, 4)
    icons = ['b_', 'b.', 'bo', 'b+', 'b*']
    for idx, l in enumerate(dist_label):
        # l is (distance, cluster index); pick one marker per cluster.
        plt.plot(X[idx, 0], X[idx, 1], icons[int(l[1])])
def main():
    # Draw 50 random 2-D points (np.random itself is a module, not callable).
    points = [np.random.rand(2) for n in range(50)]
    results = kmeans(points, 5)
    animations = [draw_points([r[0] for r in result],
                              [r[1] for r in result])
                  for result in results]
    write_animation("kmeans", animations)
def plot_color_clusters(colors, frequencies):
    centroids, clusters = kmeans(8, colors, frequencies, 1)
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    # Plot the centroids, coloured by their own (normalised) RGB value.
    # Use a distinct name so the `colors` parameter is not shadowed.
    r, g, b = zip(*centroids)
    centroid_colors = [norm_rgb(color) for color in centroids]
    ax.scatter(r, g, b, c=centroid_colors, s=100)
    # Plot each cluster's points in its centroid's colour.
    for color, cluster in zip(centroid_colors, clusters):
        r, g, b = zip(*cluster)
        ax.scatter(r, g, b, c=color, s=10)
    ax.set_xlabel('R')
    ax.xaxis.label.set_color('red')
    ax.set_ylabel('G')
    ax.yaxis.label.set_color('green')
    ax.set_zlabel('B')
    ax.zaxis.label.set_color('blue')
    ax.set_xlim(0, 256)
    ax.set_xticks(range(0, 257, 32))
    ax.tick_params(axis='x', colors='red')
    ax.set_ylim(0, 256)
    ax.set_yticks(range(0, 257, 32))
    ax.tick_params(axis='y', colors='green')
    ax.set_zlim(0, 256)
    ax.set_zticks(range(0, 257, 32))
    ax.tick_params(axis='z', colors='blue')
    return fig, ax
def main():
    # Load URL content if already downloaded,
    # otherwise use the scraper to fetch the contents on the fly.
    if os.path.isfile(webfilename) and reload:
        with open(webfilename, 'rb') as file:
            rawtexts = pickle.load(file)
    else:
        rawtexts = []
        for url in urls:
            rawtexts += [scrape_website(url)]
        # Pickle is a binary format, so the file must be opened in "wb" mode.
        with open(webfilename, "wb") as file:
            pickle.dump(rawtexts, file)

    # Convert raw text to vectors; each feature is hashed.
    text_vectors = []
    for text in rawtexts:
        features = text_to_words(text)
        text_vectors += [words_to_vector(features, ndim=N)]
    #print(text_vectors)

    # Apply the k-means algorithm.
    clusters, labels = kmeans(text_vectors, 3)
    # print(labels)

    # Now show which URL belongs to which cluster.
    for clusterindex in labels:
        print("cluster:" + str(clusterindex) + "\n")
        for urlindex in labels[clusterindex]:
            print("\t" + urls[urlindex])
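# words_to_vector is not shown in this snippet; the following is a
# hypothetical sketch of the hashing-trick vectoriser that the comment
# "each feature is hashed" describes, assuming `ndim` hash buckets. It is
# an illustration, not the original implementation.
def words_to_vector(words, ndim):
    """Hash each word into one of ndim buckets and count occurrences."""
    vec = [0.0] * ndim
    for w in words:
        vec[hash(w) % ndim] += 1.0
    return vec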
def runKmeans(arrayP, arrayPclusters, arrayC, arrayCsum, arrayCnumpoint):
    # Start timing.
    start = time()
    for i in range(REPEAT):
        # Initialise the clusters with the first k points.
        for i1 in range(NUMBER_OF_CENTROIDS):
            arrayC[i1, 0] = arrayP[i1, 0]
            arrayC[i1, 1] = arrayP[i1, 1]

        arrayC, arrayCsum, arrayCnumpoint = kmeans(
            arrayP, arrayPclusters,
            arrayC, arrayCsum, arrayCnumpoint,
            NUMBER_OF_POINTS, NUMBER_OF_CENTROIDS
        )

        if i + 1 == REPEAT:
            printCentroid(arrayC, arrayCsum, arrayCnumpoint)
    # Stop timing and report the average per repetition.
    end = time()
    total = (end - start) * 1000 / REPEAT
    print("Iterations: {:d}".format(ITERATIONS))
    print("Average Time: {:.4f} ms".format(total))
def make_kmeans():
    with open("./data/train.txt", 'r') as f:
        anchor_txt = open("anchor.txt", 'w')
        bo_list = []
        data_lines = f.readlines()
        for line in data_lines:
            line = line.strip('\n')
            line = line.split(',')
            _boxes = np.array([float(x) for x in line[1:]])
            # _boxes = np.array(list(map(float, strs[1:])))
            # Each box takes five values; split the flat array into boxes.
            index_box = len(_boxes) // 5
            boxes = np.split(_boxes, index_box)
            for bo in boxes:
                w, h = bo[2], bo[3]
                bo_list.append([w, h])
        data_np = np.array(bo_list)
        out = kmeans(data_np, 9)
        # Sort the nine anchors by area and write them in three groups,
        # largest group first.
        area_data = out[:, 0] * out[:, 1]
        data = out[np.argsort(area_data)]
        data_list1 = [str(i) for k in data[:3] for i in k]
        data_list2 = [str(i) for k in data[3:6] for i in k]
        data_list3 = [str(i) for k in data[6:9] for i in k]
        list1_str = ','.join(data_list1)
        list2_str = ','.join(data_list2)
        list3_str = ','.join(data_list3)
        anchor_txt.write(list3_str + '\n' + list2_str + '\n' + list1_str)
        anchor_txt.close()
        print(out)
        print("Accuracy: {:.2f}%".format(avg_iou(data_np, out) * 100))
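# The avg_iou helper used above is not shown. A common definition for this
# kind of anchor clustering (a sketch, assuming `boxes` and `clusters` are
# arrays of [w, h] pairs) scores each box by its best IoU against the
# cluster centres, treating all boxes as if they shared a corner:
import numpy as np

def iou(boxes, clusters):
    """IoU between [w, h] boxes and cluster centres, corners aligned."""
    inter_w = np.minimum(boxes[:, None, 0], clusters[None, :, 0])
    inter_h = np.minimum(boxes[:, None, 1], clusters[None, :, 1])
    inter = inter_w * inter_h
    box_area = boxes[:, 0] * boxes[:, 1]
    cluster_area = clusters[:, 0] * clusters[:, 1]
    return inter / (box_area[:, None] + cluster_area[None, :] - inter)

def avg_iou(boxes, clusters):
    """Mean of each box's best IoU with any cluster centre."""
    return np.mean(np.max(iou(boxes, clusters), axis=1))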
def main(fileName, k):
    sourceData = readData.readData(fileName)
    result1 = kmeans(sourceData, k)
    result2 = kmeansPlusPlus(sourceData, k)
    return result1, result2
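# kmeansPlusPlus presumably differs from plain kmeans only in how it seeds
# the centroids. A minimal sketch of standard k-means++ seeding (an
# illustration, assuming `data` is an (n, d) NumPy array): each new centre
# is drawn with probability proportional to its squared distance from the
# nearest centre already chosen.
import numpy as np

def kmeans_pp_init(data, k, seed=0):
    """k-means++ seeding: spread initial centres proportionally to D^2."""
    rng = np.random.default_rng(seed)
    centers = [data[rng.integers(len(data))]]
    for _ in range(k - 1):
        c = np.array(centers)
        # Squared distance from every point to its nearest chosen centre.
        d2 = ((data[:, None, :] - c[None, :, :]) ** 2).sum(axis=2).min(axis=1)
        centers.append(data[rng.choice(len(data), p=d2 / d2.sum())])
    return np.array(centers)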
def main():
    bdd = Bdd()
    bdd.connect("e13")
    bdd.change_default_timeout(3600)
    #handle_file.file_insertion_handler(Bdd)
    x = kmeans(bdd, "facts", "end", 5)
    print(x)
    answer(bdd)
    bdd.disconnect()
def iris(k=3):
    data = pd.read_csv("test/iris.dat")
    (means, clusts, err) = kmeans(
        data.loc[:, "sepal_length":"petal_width"].values, k)
    f = mp.figure(1)
    for c in data["class"].unique():
        points = data[data["class"] == c]
        # A single RGB colour must be a flat length-3 sequence.
        mp.plot(points["sepal_length"], points["sepal_width"], "o",
                color=np.random.random(3))
    mp.plot(means[:, 0], means[:, 1], "ko", markersize=7)
    mp.show()
def compress(image_path, n_color=2, n_iterations=10, n_images=3, err_tol=100):
    image = cv2.imread(image_path)
    # OpenCV images are (height, width, channels).
    height = image.shape[0]
    width = image.shape[1]
    image = image.reshape(height * width, RGB_SIZE)
    calculated_image = np.ndarray(image.shape)
    (centroids, clusters) = kmeans(n_color, image, n_iters=int(n_iterations))
    # Replace every pixel with the centroid of its cluster.
    for key, value in clusters.items():
        calculated_image[key] = centroids[value]
    return calculated_image.reshape(height, width, RGB_SIZE)
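# A hedged usage sketch for compress() above. "bird.png" is a hypothetical
# input file, and RGB_SIZE is assumed to be the module-level channel count.
import cv2

RGB_SIZE = 3
compressed = compress("bird.png", n_color=16, n_iterations=10)
cv2.imwrite("bird_16colors.png", compressed.astype("uint8"))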
def run(self):
    self.read_input()
    algo = self.options['algo']
    params = self.options['params']
    if algo == 'kmeans':
        model = kmeans(self.doc, params)
    elif algo == 'dbscan':
        model = dbscan(self.doc, params)
    elif algo == 'agglo':
        model = agglo(self.doc, params)
    elif algo == 'minib':
        model = minib(self.doc, params)
    else:
        raise ValueError("unknown algorithm: " + algo)
    model.evaluate()
def main():
    # K-means
    kmeans_machine = kmeans(sample, k)
    kmeans_machine.train()

    # KNN
    knn_machine = knn(test, sample, target, k)
    print knn_machine.train()

    # Decision tree

    # SVM
    svm_run(Dataset, Trainset)
def restart_kmeans(data, k, times=10):
    minimum_objective = 0
    first = True
    for i in range(times):
        clusters = initialize_cluster_centers(data, k)
        cluster_centers, objective = kmeans(data, clusters)
        # Only the first restart initialises the minimum; later restarts
        # may only lower it.
        if first is True:
            minimum_objective = objective
            first = False
        elif minimum_objective > objective:
            minimum_objective = objective
    print(
        f"Best objective of k value: {k} for {times} times: {minimum_objective}"
    )
    return minimum_objective
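# A hedged usage sketch: because k-means depends on its random
# initialisation, restart_kmeans can be scanned over k to draw an elbow
# curve. Assumes `data` is the same point collection used elsewhere and
# matplotlib is available.
import matplotlib.pyplot as plt

ks = list(range(2, 11))
objectives = [restart_kmeans(data, k, times=10) for k in ks]
plt.plot(ks, objectives, marker='o')
plt.xlabel("k")
plt.ylabel("best objective over restarts")
plt.show()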
def test_success_valid_learning(self):
    K = 3
    kmeans_instance = KMeans(k=K)
    samples = np.array([[77.3, 13.0, 9.7, 1.5, 6.4],
                        [82.5, 10.0, 7.5, 1.5, 6.5],
                        [66.9, 20.6, 12.5, 2.3, 7.0],
                        [47.2, 33.8, 19.0, 2.8, 5.8],
                        [65.3, 20.5, 14.2, 1.9, 6.9],
                        [83.3, 10.0, 6.7, 2.2, 7.0],
                        [81.6, 12.7, 5.7, 2.9, 6.7],
                        [47.8, 36.5, 15.7, 2.3, 7.2],
                        [48.6, 37.1, 14.3, 2.1, 7.2],
                        [61.6, 25.5, 12.9, 1.9, 7.3],
                        [58.6, 26.5, 14.9, 2.4, 6.7],
                        [69.3, 22.3, 8.4, 4.0, 7.0],
                        [61.8, 30.8, 7.4, 2.7, 6.4],
                        [67.7, 25.3, 7.0, 4.8, 7.3],
                        [57.2, 31.2, 11.6, 2.4, 6.5],
                        [67.2, 22.7, 10.1, 3.3, 6.2],
                        [59.2, 31.2, 9.6, 2.4, 6.0],
                        [80.2, 13.2, 6.6, 2.0, 5.8],
                        [82.2, 11.1, 6.7, 2.2, 7.2],
                        [69.7, 20.7, 9.6, 3.1, 5.9]])
    init_centroid = [np.array([82.5, 10.0, 7.5, 1.5, 6.5]),
                     np.array([47.8, 36.5, 15.7, 2.3, 7.2]),
                     np.array([67.2, 22.7, 10.1, 3.3, 6.2])]
    kmeans_instance.set_samples(samples)
    kmeans_instance.set_centroids(init_centroid)
    ok_(kmeans_instance.learn())

    # Reference answer from the library's kmeans with the same initial codebook.
    book = np.array(([82.5, 10.0, 7.5, 1.5, 6.5],
                     [47.8, 36.5, 15.7, 2.3, 7.2],
                     [67.2, 22.7, 10.1, 3.3, 6.2]))
    np_kmeans_results = kmeans(samples, book)
    answer = np.array([np.argmin(np.sum((d - np_kmeans_results[0]) ** 2, axis=1))
                       for d in samples])

    answer_each_size = [np.sum(answer == k) for k in range(K)]
    actual_each_size = kmeans_instance.get_each_cluster_size()
    for i, v in enumerate(answer_each_size):
        eq_(v, actual_each_size[i])

    actual_assign = kmeans_instance.get_assign_list()
    matched_results = (answer == actual_assign)
    ok_(np.all(matched_results))
def improve_clast_by_disp(X, disp, matExpend, disp_eps, eps):
    currDisp = np.max(X - matExpend)
    if abs(currDisp - disp) < disp_eps:
        # Dispersion changed less than disp_eps: stop splitting this branch.
        return None
    if len(X) <= 1:
        return X
    clasters = kmeans(X, 2, eps)
    labels = assign_clusters(X, clasters)
    new_clusters_list = []
    for i in range(clasters.shape[0]):
        # Recursively try to split each of the two sub-clusters.
        ret = improve_clast_by_disp(X[np.where(labels == i)], currDisp,
                                    clasters[i], disp_eps, eps)
        if ret is None:
            new_clusters_list.append(clasters[i].reshape(1, X.shape[1]))
        else:
            new_clusters_list.append(ret)
    new_clusters = np.concatenate(new_clusters_list)
    return new_clusters
def plot_best():
    best = {
        "clustering1": 2,
        "clustering2": 3,
        "clustering3": 4,
        "clustering4": 5,
    }
    for key, value in best.items():
        # Look up the dataset by its variable name.
        data = eval(key)
        clusters = initialize_cluster_centers(data, value)
        cluster_centers, objective = kmeans(data, clusters)
        labels = assign_clusters(data, cluster_centers)
        plt.scatter([x[0] for x in data], [x[1] for x in data], c=labels)
        plt.scatter([x[0] for x in cluster_centers],
                    [x[1] for x in cluster_centers], c='r', marker='P')
        plt.savefig('report/kmeans-' + key + '-' + str(value) + '.png',
                    bbox_inches='tight')
        plt.clf()
        j = j + 1
        if j > printEntries:
            print("\nVocab size = " + str(len(words)) + "\n")
            break

if __name__ == "__main__":
    # Assign values to all arguments.
    pathOfVectors = "data\\agentlogsVecsAscii.txt"  # By default
    vectorDimLen = 200  # By default
    regexs = ["diskhealthmonitor", "createcontainer"]  # By default
    if len(sys.argv) > 1:
        vectorDimLen = int(sys.argv[1])
    if len(sys.argv) > 2:
        pathOfVectors = sys.argv[2]
    if len(sys.argv) > 3:
        regexs = sys.argv[3:]
    print "vector dimension should be: " + str(vectorDimLen)

    pv = plotVecs(vectorDimLen, pathOfVectors, regexs)

    # Try k-means clustering.
    k = 3
    res = kmeans(pv.X, np.array(sample(pv.X, k)))
    for i in xrange(k):
        splitClusterWords(np.array(pv.rawLogs)[res[1] == i])

    # Give OPTICS a shot.
    pv.X = np.array(pv.X)
    optics(pv.X)
from pylab import *
from pyIOUtils import *
from kmeans import *

data = array(readMatFile("alldata.mat"))
allsmallsets = set()
l = 256
for i in range(20):
    # Cluster the raw data into two groups and remember the smaller one.
    members = kmeans(data, 2, l)[1]
    smallset = members[1]
    if len(members[0]) < len(members[1]):
        smallset = members[0]
    for j in smallset:
        allsmallsets.add(j)
print allsmallsets

m = 256
for l in range(1, 256, 3):
    allsmallsetscomp = set()
    avg = 0.0
    for t in range(20):
        # Random projection from m down to l dimensions, scaled by 1/sqrt(m).
        R = randn(m, l)
        pdata = dot(data, R) * (1. / float(m)) ** .5
        members = kmeans(pdata, 2, l)[1]
        smallset = members[1]
        if len(members[0]) < len(members[1]):
            smallset = members[0]
        for j in smallset:
            allsmallsetscomp.add(j)
    uall = float(len(allsmallsetscomp.union(allsmallsets)))
    iall = float(len(allsmallsetscomp.intersection(allsmallsets)))
# MAIN PART
# Read data from the multiple databases.
data3G1 = readData('../Databases/SimpleOCR/DB3/G1', False)
data3G2 = readData('../Databases/SimpleOCR/DB3/G2', False)
data2G1 = readData('../Databases/SimpleOCR/DB2/G1', False)
data2G2 = readData('../Databases/SimpleOCR/DB2/G2', False)
data1G12 = readData('../Databases/SimpleOCR/DB1', True)
print len(data1G12[0])

data = data3G1 + data3G2
#data = [[1,2],[3,4],[1,1]]
clusters = kmeans(data, 10, 0.001)

for idx, i in enumerate(data2G1):
    print "Number", idx
    print "classified ", classify(clusters, i)
for idx, i in enumerate(data2G2):
    print "Number", idx
    print "classified ", classify(clusters, i)
for idx, i in enumerate(data1G12):
    print "Number ", idx
    print "classified ", classify(clusters, i)
from tf_idf import *
from data import *
from rfm import *
from evaluation import get_score

'''
Author : Wen-Han Hu
'''

# Load the data after preprocessing.
df = load_data()

# Build the typical RFM model.
print("Building typical RFM model")
typical_rfm = rfm(df)
matrix = rfm_matrix(typical_rfm)
clusters = kmeans(matrix=matrix, cluster_num=4)
#typical_rfm = rfm_write_back(typical_rfm, clusters)
result = get_score(matrix, clusters, 'Typical RFM')

# Build the stock_id RFM model.
print("Building StockID RFM model")
stock_rfm = rfm(df, model_type='StockID')
#stock_rfm = rfm_transform(stock_rfm)
matrix = rfm_matrix(stock_rfm, model_type=1)
clusters = kmeans(matrix=matrix, cluster_num=5)
#stock_rfm = rfm_write_back(stock_rfm, clusters)
result = get_score(matrix, clusters, 'StockID RFM', result, flag=1)

# Build the TF-IDF RFM model.
print("Building TF-IDF RFM model")
matrix = tf_idf(df)
def test_kmeans(dim, kc, kn, m):
    result = kmeans(dim, kc, kn, m)
    title = "prediccionesKmeans.csv"
    save_csv(result, title)
def kMeansCol(mu_c, sig_c, n_iter=100, n_clusters=3, delta=0.001, verbose=2):
    """
    mu_c and sig_c have the same shape as OSMatrix.
    mu_c and sig_c are the cropped mu and sigma for every region of the
    OSMatrix.
    """
    centroids = np.empty((mu_c.shape), dtype='object')
    weights = np.empty((mu_c.shape), dtype='object')
    for i in range(mu_c.shape[0]):
        for j in range(mu_c.shape[1]):
            # Pair each region's mu with its sigma as a two-column feature.
            mu_sigma = np.array([mu_c[i, j].ravel(), sig_c[i, j].ravel()]).T
            X = mu_sigma
            ncluster = n_clusters
            if X.shape[0] <= ncluster:
                ncluster = 1
            centres, xtoc, dist = kmeans(data=X, nclusters=ncluster,
                                         niter=n_iter, delta=delta,
                                         datatype=2, verbose=False)
            centroids[i, j] = centres
            # Weight each centroid by the share of points assigned to it.
            wt = Counter(xtoc)
            wtx = wt.items()
            if len(wtx) == 1:
                wt = [1.0]
            else:
                wt = [sec for (one, sec) in wtx]
                wt = [1 - (x / sum(wt)) for x in wt]  # 1 - (wt/sum(wt))
                wt = [(1 - x) for x in wt]
            weights[i, j] = wt
            # print wt
            # print centres
            # print xtoc
            mean_centroids = centres[:, 0]
            variance_centroids = centres[:, 1]
            mean_centroids = [x.flatten() for x in mean_centroids]
            #print mean_centroids
            #mean_centroids = np.reshape(mean_centroids, (len(centres), 2))
            #variance_centroids = np.reshape(variance_centroids, (len(centres), 2, 2))
            #print mean_centroids.shape, variance_centroids.shape

    colors = ['r', 'g', 'b']  # length of this should be `k`
    ##### Plot clusters (kept for reference, disabled):
    #fig = figure()
    #ax = fig.add_subplot(111, aspect='equal')
    # wt = map(int, wt)
    # maxwt = np.max(wt)
    # minwt = np.min(wt)
    # mc = [(x - minwt)/(maxwt - minwt) for x in wt]
    # mc = softmax(mc)
    # X = gmm.sample_gaussian_mixture(mean_centroids, variance_centroids, samples=100)
    # plot(X[:,0], X[:,1], '.')
    # for j in range(len(mc)):
    #     x1, x2 = gmm.gauss_ellipse_2d(mean_centroids[j], variance_centroids[j])
    #     plot(x1, x2, colors[j], linewidth=2)
    # show()
    ##### Plotted! #####
    return centroids, weights
import os
from stopwords import *
from tfidf import get_all_vector
from kmeans import *
import shutil

filepath = "F:\\PycharmProjects\\Crawl\\data.json"
savepath = "F:\\PycharmProjects\\Clustering\\data"
newspath = "F:\\PycharmProjects\\Clustering\\news"
historypath = "F:\\PycharmProjects\\Clustering\\history"

dividetotxt(filepath, savepath)
stop_words_set = stop_words("F:\\PycharmProjects\\Clustering\\stopwords.txt")
dataset = get_all_vector(savepath, historypath, stop_words_set)
result = kmeans(dataset[1], 8)

# Recreate the output directory and copy each article into the folder
# named after its cluster.
if os.path.exists(newspath):
    shutil.rmtree(newspath)
os.makedirs(newspath)
resultpaths = []
for i in range(result[1].shape[0]):
    temp = dataset[0][i].rfind("\\") + 1
    sort = int(result[1].tolist()[i][0])
    resultpath = newspath + "\\" + str(sort)
    if resultpath not in resultpaths:
        resultpaths.append(resultpath)
    if not os.path.exists(resultpath):
        os.makedirs(resultpath)
    shutil.copyfile(dataset[0][i],
                    resultpath + "\\" + dataset[0][i][temp:len(dataset[0][i])])
resultpaths.sort()
    [70, 140],
    [70, 160],
    [65, 132],
    [48, 75],
    [72, 175],
    [67, 167],
    [69, 140],
    [96, 285],
    [70, 172],
    [70, 185],
    [71, 168],
    [70, 180],
    [69, 170],
    [70, 150],
    [70, 170],
    [71, 144],
    [66, 140],
    [67, 175],
    [67, 165],
    [72, 175]
])

matrix = matrix.astype(np.float64)
for row in matrix:
    print row

std_matrix = standardizeData(matrix)
for row in std_matrix:
    print row

# Note: clustering is run on the raw matrix, not std_matrix.
kmeans(matrix, 2)
    cls = np.argmax(gmm.resp, axis=1)
    # print(cls)
    clr = ["r", "g", "gold", "brown", "black"]
    for k in range(KK):
        index_k = np.where(cls == k)[0]
        x_k = gmm.data[index_k].T
        plt.scatter(x_k[0], x_k[1], s=15, c=clr[k])
    plt.title('em gmm')
    plt.show()

if case == "kmeans":
    KK = 4
    gmm = EM_GMM(K=KK, data="40")
    # Seed the mixture means with k-means centroids.
    gmm.means = kmeans(data=gmm.data, K=KK)
    pt = gmm.data.T
    fig, ax = plt.subplots()
    gmm.EM()
    print(gmm.coeff)
    print(gmm.means)
    print(gmm.covar)
    for i in range(KK):
        # Draw a 2-sigma ellipse for each fitted Gaussian component.
        v, w = np.linalg.eigh(gmm.covar[i])
        v = 2. * np.sqrt(2.) * np.sqrt(v)
        u = w[0] / np.linalg.norm(w[0])
        angle = np.arctan(u[1] / u[0])
        angle = 180. * angle / np.pi  # convert to degrees
        ell = matplotlib.patches.Ellipse(gmm.means[i], v[0], v[1],
#!/usr/bin/python
from kmeans import *
from numpy import *
import time
import matplotlib.pyplot as plt

## step 1: load data
print "step 1: load data..."
dataSet = []
fileIn = open('./txt/spec-429-100M.txt')
#fileIn = open('./123.txt')
for line in fileIn.readlines():
    lineArr = line.strip().split('\t')
    dataTmp = []
    for num in range(len(lineArr)):
        dataTmp.append(float(lineArr[num]))
    dataSet.append(dataTmp)

## step 2: clustering...
print "step 2: clustering..."
dataSet = mat(dataSet)
k = 1
centroids, clusterAssment = kmeans(dataSet, k)

## step 2.5: delete the furthest 0.1% points

## step 3: show the result
#print "step 3: show the result..."
#showCluster(dataSet, k, centroids, clusterAssment)
def kMeansInt(mu_c, sig_c, n_iter=100, n_clusters=3, delta=0.001, verbose=2):
    """
    mu_c and sig_c have the same shape as OSMatrix.
    mu_c and sig_c are the cropped mu and sigma for every region of the
    OSMatrix.
    """
    centroids = np.empty((mu_c.shape), dtype='object')
    weights = np.empty((mu_c.shape), dtype='object')
    #print mu_c[0,0].shape, mu_c[1,0].shape, mu_c[2,0].shape
    for i in range(mu_c.shape[0]):
        for j in range(mu_c.shape[1]):
            mu_sigma = np.array([mu_c[i, j].ravel(), sig_c[i, j].ravel()]).T
            #print mu_sigma.shape
            X = mu_sigma
            ncluster = n_clusters
            #print X.shape, ncluster
            if X.shape[0] <= ncluster:
                ncluster = 1
            centres, xtoc, dist = kmeans(data=X, nclusters=ncluster,
                                         niter=n_iter, delta=delta,
                                         datatype=1, verbose=False)
            centroids[i, j] = centres
            # Weight each centroid by the share of points assigned to it.
            wt = Counter(xtoc)
            #print wt
            wt = wt.items()
            print wt
            wt = [sec for (one, sec) in wt]
            print wt
            if len(wt) == 1:
                wt = [1.0]
            else:
                wt = [1 - (x / sum(wt)) for x in wt]  # 1 - (wt/sum(wt))
                wt = [(1 - x) for x in wt]
            weights[i, j] = wt
            print wt

    # Commented-out per-cluster scatter plot of the assignments:
    # idx = xtoc
    # centroids1 = centres
    # plot(data[idx==0,0], data[idx==0,1], 'ob',
    #      data[idx==1,0], data[idx==1,1], 'or',
    #      data[idx==2,0], data[idx==2,1], 'og',
    #      data[idx==3,0], data[idx==3,1], 'oy',
    #      data[idx==4,0], data[idx==4,1], 'oc')
    # plot(centroids1[:,0], centroids1[:,1], 'sg', markersize=8)
    # show()
    #print centroids[0,0].shape, weights[0,0].shape
    return centroids, weights
os.system("./a.out X.mat " + str(k)) means = readMatFile("out.mat") rp2Avg.append(getLabelAccuracy(means,testX,testY,[k])) os.system("./rp1.out X.mat " + str(k)) means = readMatFile("out.mat") rp1Avg.append(getLabelAccuracy(means,testX,testY,[k])) start = time.time() #standard kmeans means,clusters = kmeans(trainX,k,h) print time.time()-start kmAvg.append(getLabelAccuracy(means,testX,testY,[k])) del(X,Y,trainX,trainY,testX,testY) rp2AllhashPR.append(sum(rp2AllAvg)/float(av)) rp2hashPR.append(sum(rp2AllAvg)/float(av)) rp1hashPR.append(sum(rp2AllAvg)/float(av)) kmeansPR.append(sum(kmAvg)/float(av)) dimlist.append(h) mrp2All = sum(rp2AllAvg)/float(av) mrp2 = sum(rp2Avg)/float(av)
cx = np.sum(cm, axis=0)
cy = np.sum(cm, axis=1)
tp, fp, fn, FM = [], [], [], []
for j in range(10):
    tp.append(cm[j, j])
    fp.append(cx[j] - cm[j, j])
    fn.append(cy[j] - cm[j, j])
    # Per-class Fowlkes–Mallows: geometric mean of precision and recall.
    FM.append(((cm[j, j] / cx[j]) * (cm[j, j] / cy[j])) ** 0.5)
F_M = 0
for temp in FM:
    F_M = float(F_M + temp)
print("The Fowlkes–Mallows index is: ", float(F_M / 10))

centroids, clusterAssment = kmeans(dataSet, k)
pre = []
for i in range(numSamples):
    pre.append(clusterAssment[i, 0])
print("Result of kmeans:")
result(pre)

ward = cluster.AgglomerativeClustering(n_clusters=k, linkage='ward')
pre = ward.fit_predict(digits.data)
print("Result of Agglomerative clustering with Ward linkage:")
result(pre)

affinity_propagation = cluster.AffinityPropagation()
pre = affinity_propagation.fit_predict(digits.data)
print("Result of clustering with AffinityPropagation:")
print(pre)
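# For reference, the per-class quantity accumulated in the loop above is the
# geometric mean of precision and recall taken from the confusion matrix.
# A vectorised equivalent (hypothetical helper, not from the original),
# assuming `cm` is a square confusion matrix:
import numpy as np

def fowlkes_mallows_per_class(cm):
    """Geometric mean of precision and recall for each class."""
    cm = np.asarray(cm, dtype=float)
    col = cm.sum(axis=0)  # predicted-class totals (TP + FP)
    row = cm.sum(axis=1)  # true-class totals (TP + FN)
    diag = np.diag(cm)    # true positives
    return np.sqrt((diag / col) * (diag / row))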
import matplotlib.pyplot as plt
import numpy as np
from kmeans import *

## step 1: load data
# print ("step 1: load data...")
weight = np.load("./save_np/fc_w_noprune.npz")
fc1_w = np.mat(weight['fc3_w'])
fc1_w = fc1_w.reshape([192 * 10, 1])
print(fc1_w.shape)
print(type(fc1_w))
print(fc1_w)

## step 2: clustering...
print("step 2: clustering...")
k = 10
# Call the kmeans method defined in the KMeans module.
centroids, clusterAssment = kmeans(fc1_w, k)
clusterAssment = clusterAssment[:, 0]
print(centroids)
print(clusterAssment)
np.savez("./save_kmeans/fc3_w_clusterAssment.npz",
         fc1_w_clusterAssment=clusterAssment)

## step 3: show the result
# print ("step 3: show the result...")
# showCluster(fc1_w, k, centroids, clusterAssment)

# fc1_w_mask = np.load("./save_kmeans/fc2_w_clusterAssment.npz")
# fc1_w_clusterAssment = np.mat(fc1_w_mask['fc1_w_clusterAssment'])
# a = fc1_w_clusterAssment.reshape([120, 84])
# print(a.shape)
# print(type(a))
# print(a)
# plt.title('Cumulative variance vs PC')
# plt.show()
exit()

# question 3.3
from kmeans import *  # this script contains everything needed to run the k-means algorithm

k = 4
initialIndices = np.array([0, 1, 2, 3])  # the first 4 data points will be the initial clusters
###initialIndices = np.array(random.sample(range(0, len(data)), k))  # if random seeding is wanted

# Run k-means.
new_centers, _ = kmeans(data, initialIndices, k)

# Add the centres as new data points to the original dataset.
data2 = np.vstack([data, new_centers])
#print data2.shape  #[1279, 1568]

# Run PCA on the new dataset.
_, _, Y_centers = apply_pca(data2)

# Add colour labels for the cluster centres.
col_labels.extend(['yellow', 'yellow', 'yellow', 'yellow'])

##### Plotting
PC1_centers = Y_centers[:, 0]
PC2_centers = Y_centers[:, 1]
print Y_centers.shape
if __name__ == '__main__':
    k = 16
    img = Image.open('bird_small.png').getdata()
    # Picture size.
    leng, wid = img.size  # 128, 128
    # Number of pixels.
    m = leng * wid
    data = np.array(img, dtype=np.float64) / 255
    #print(data[0])   # [0.85882353 0.70588235 0.40392157]
    #print(len(data)) # 16384

    # Original picture pixel density.
    orgpic = data.copy()
    # Compressed picture pixel density.
    compic = data.copy()

    # Initialise centroids randomly.
    centroids = centInit(data, k, 3)
    idx, history = kmeans(data, centroids)
    # Get the final converged points.
    centroids = np.array(history[len(history) - 1])

    # Compress the picture: replace same-labelled pixels with their centroid.
    for i in range(0, k):
        compic[idx == i] = centroids[i]

    fig, ax = plt.subplots(2)
    # The value for each component of MxNx3 and MxNx4 float arrays should be
    # in the range 0.0 to 1.0; MxN float arrays may be normalised.
    ax[0].imshow(orgpic.reshape((leng, wid, 3)))
    ax[1].imshow(compic.reshape((leng, wid, 3)))
    plt.show()
n = len(dataset)
number_of_clusters = 5
x = dataset[:, 1]    # coordinate x
y = dataset[:, 2]    # coordinate y
pop = dataset[:, 3]  # population
coordinates = [(x[i], y[i]) for i in range(n)]  # points

# NORMAL EXECUTION
# clusters = hierarchical_clustering(coordinates, number_of_clusters)
# clusters = kmeans(coordinates, number_of_clusters, pop, q=6)

# BEGIN DISTORTION
kmeans_distortion = []
for i in range(6, 21):
    clusters = kmeans(coordinates, i, pop, q=5)
    kmeans_distortion.append(distortion(clusters))
clusters, distortion_clusters = hierarchical_clustering(coordinates,
                                                        number_of_clusters)
hierarchical_distortion = [distortion(c) for c in distortion_clusters]

plt.plot(np.arange(6, 21), hierarchical_distortion[::-1])
plt.plot(np.arange(6, 21), kmeans_distortion)
plt.legend(labels=["Hierarchical", "K-means"])
plt.xlabel("Number of clusters")
plt.ylabel("Distortion")
plt.title("Distortion graph of dataset: {} counties".format(f_code))
plt.gca().invert_xaxis()
plt.xticks(np.arange(6, 21, 1))
plt.grid()
# plt.savefig("./risposte/distortion_{}_domanda9.png".format(f_code))
import numpy as np
from kmeans import *

if __name__ == '__main__':
    fig, ax = plt.subplots()
    ex7data2 = np.load('ex7data2.npz')
    x = ex7data2['X']
    m, n = x.shape
    k = 3
    """
    centroids = np.zeros((k, n))
    centroids[0] = [3, 3]
    centroids[1] = [6, 2]
    centroids[2] = [8, 5]
    """
    centroids = centInit(x, k, 2)
    #idx = closestCentroids(x, centroids)
    #print computeCentroids(x, idx, k)
    idx, history = kmeans(x, centroids, tol=1e-5)
    plotResult(ax, x, idx)
    moveTrace(ax, history)
    plt.show()
def split_tuple(centers):
    result = [[], []]
    for i in centers:
        result[0].append(i[0])
        result[1].append(i[1])
    return result

data = []
print 'This is a k-means demo: the program creates a random point set, clusters it with k-means, and shows a result plot (the plot itself is not part of the algorithm).'
print 'Enter the number of random points:'
n = input('>')
print 'Enter the number of partitions k:'
k = input('>')
for i in range(n):
    data.append(produce_random_point())

centers = kmeans(data, k)
centers[0] = split_tuple(centers[0])
for i in range(k):
    centers[1][i] = split_tuple(centers[1][i])

colors = random_color(k)
fig = plt.figure(figsize=(16, 12), dpi=72, facecolor="white")
for i in range(k):
    plt.scatter(centers[1][i][0], centers[1][i][1], color=colors[i])
    plt.triplot(centers[1][i][0], centers[1][i][1], linewidth=0.1)
plt.scatter(centers[0][0], centers[0][1], marker='*', s=300)
plt.show()