def __init__(self, n_clusters=40):
    self.n_clusters = n_clusters
    self.kmeans_obj = KMeans(n_clusters=n_clusters)
    self.kmeans_ret = None
    self.descriptor_vstack = None
    self.mega_histogram = None
    self.clf = GaussianNB()
def bestKElbow(X, options):
    K = 2
    finish = 10
    kmeans_list = []
    fits = []
    porcentage = []
    tolerance = 10  # 10% tolerance
    finished = False
    best_k = 0
    # k-means with k=2
    kmeans = km.KMeans(X, K, options)
    kmeans_list.append(kmeans)
    fits.append(kmeans.fitting())
    K += 1
    # k-means with k=3
    kmeans = km.KMeans(X, K, options)
    kmeans_list.append(kmeans)
    fits.append(kmeans.fitting())
    K += 1
    # store the first drop and assign it a percentage of 100%
    porcentage.append((fits[0] - fits[1], 100))
    while K <= finish and not finished:
        best_k = 0
        kmeans = km.KMeans(X, K, options)
        kmeans_list.append(kmeans)
        fits.append(kmeans.fitting())
        K += 1
        # subtract the last two fits (the two most recent k-means runs)
        resta = fits[-2] - fits[-1]
        # express it as a percentage of the first drop
        first_resta = porcentage[0][0]
        if first_resta == 0:
            first_resta = 1e-12
        porcentage.append((resta, resta * 100 / first_resta))
        # if the last two percentages are below the tolerance,
        # pick the k just before those two
        if porcentage[-2][1] < tolerance and porcentage[-1][1] < tolerance:
            finished = True
            # subtract: k starts at 2, plus the two extra ks we went past
            best_k = K - 2 - 2 - 1
    # if the tolerance criterion never fired, look for the largest
    # percentage drop and choose the K right after it
    if best_k == 0:
        max_porcentage = porcentage[1][1]
        porcentage_pos = 1
        # enumerate from index 2 so positions match the porcentage list
        # (the original restarted the counter at 0 here, off by two)
        for pos, x in enumerate(porcentage[2:], start=2):
            if x[1] > max_porcentage:
                max_porcentage = x[1]
                porcentage_pos = pos
        best_k = porcentage_pos
    return kmeans_list[best_k]
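# A minimal elbow-method sketch for contrast, assuming scikit-learn is
# available; it keeps the idea above (inspect successive drops in the
# clustering objective) but uses sklearn's inertia_ instead of the
# project's fitting()/tolerance heuristic. The helper name elbow_k and the
# steepest-drop rule are illustrative, not the project's method.
import numpy as np
from sklearn.cluster import KMeans as SkKMeans


def elbow_k(X, k_min=2, k_max=10, random_state=0):
    inertias = [SkKMeans(n_clusters=k, n_init=10, random_state=random_state).fit(X).inertia_
                for k in range(k_min, k_max + 1)]
    drops = np.diff(inertias)  # negative values; most negative = steepest drop
    return k_min + int(np.argmin(drops)) + 1  # first k after the steepest drop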
def evaluate_adjusted_rand_score(data, threshold, random):
    full_data = Parser.parse_data(only_answers=False)
    labels_true = Parser.get_true_party_assignment(full_data)
    number_of_parties = 11
    assignment = KMeans.kmeans(data, number_of_parties, threshold, random)
    labels_pred = KMeans.get_centroid_labels(data, assignment)
    return adjusted_rand_score(labels_true, labels_pred)
def testGapStatistic():
    count = 10
    # alternative data sources kept for reference:
    # X = np.array(np.loadtxt("data/kmeans1.txt", delimiter="\t"))
    # X = np.mat(np.load("/media/WindowsE/Data/PARS/JNLH/YuanLiaoDaiShui/trainSet/train_all_75.npy"))[:, 68]  # 57
    # X = np.mat(np.load("/media/WindowsD/WorkSpace/data.npy"))[:, 0]
    X = np.load("/media/WindowsE/Data/PARS/JNLH/ReasonAnalysis/30/2020-08-01/data.npy")[:, 1].reshape(-1, 1)

    plt.figure(1, (12, 8))
    plt.get_current_fig_manager().window.showMaximized()
    plt.hist(X, bins=1000)
    plt.show(block=True)

    selector1 = KMeans.GapStatistic(20, True)
    selector2 = KMeans.ElbowMethod(True)
    index, distance, center = KMeans.KMeans.optimalK(
        X, 5, count, KMeans.CombinationOptimalKSelector([selector1, selector2]))

    plt.figure(1, (12, 8))
    plt.get_current_fig_manager().window.showMaximized()
    plt.hist(X, bins=1000)
    for i in range(len(center)):
        plt.axvline(center[i, 0], color="r")
    plt.show(block=True)
def processImage(im, options):
    """@brief   Finds the colors present in the input image
    @param im       LIST        input image
    @param options  DICTIONARY  dictionary with options
    @return colors  LIST        colors of centroids of kmeans object
    @return indexes LIST        indexes of centroids with the same label
    @return kmeans  KMeans      object of the class KMeans
    """
    #########################################################
    ## YOU MUST ADAPT THE CODE IN THIS FUNCTION TO:
    ## 1- CHANGE THE IMAGE TO THE CORRESPONDING COLOR SPACE FOR KMEANS
    ## 2- APPLY KMEANS ACCORDING TO 'OPTIONS' PARAMETER
    ## 3- GET THE NAME LABELS DETECTED ON THE 11 DIMENSIONAL SPACE
    #########################################################

    ## 1- CHANGE THE IMAGE TO THE CORRESPONDING COLOR SPACE FOR KMEANS
    if options['colorspace'].lower() == 'colornaming':
        im = cn.ImColorNamingTSELabDescriptor(im)
    elif options['colorspace'].lower() == 'rgb':
        pass  # im = np.reshape(im, (-1, im.shape[2]))
    elif options['colorspace'].lower() == 'lab':
        im = color.rgb2lab(im)
    elif options['colorspace'].lower() == 'hsv':
        im = color.rgb2hsv(im)

    ## 2- APPLY KMEANS ACCORDING TO 'OPTIONS' PARAMETER
    if options['K'] < 2:
        kmeans = km.KMeans(im, 0, options)
        kmeans.bestK()
    else:
        kmeans = km.KMeans(im, options['K'], options)
        kmeans.run()

    ## 3- GET THE NAME LABELS DETECTED ON THE 11 DIMENSIONAL SPACE
    if options['colorspace'].lower() == 'rgb':
        kmeans.centroids = cn.ImColorNamingTSELabDescriptor(kmeans.centroids)
    elif options['colorspace'].lower() == 'lab':
        kmeans.centroids = kmeans.centroids[:, newaxis, :]
        kmeans.centroids = color.lab2rgb(kmeans.centroids) * 255.0
        kmeans.centroids = np.reshape(
            kmeans.centroids, (kmeans.centroids.shape[0], kmeans.centroids.shape[2]))
        kmeans.centroids = cn.ImColorNamingTSELabDescriptor(kmeans.centroids)
    elif options['colorspace'].lower() == 'hsv':
        kmeans.centroids = kmeans.centroids[:, newaxis, :]
        kmeans.centroids = color.hsv2rgb(kmeans.centroids)
        kmeans.centroids = np.reshape(
            kmeans.centroids, (kmeans.centroids.shape[0], kmeans.centroids.shape[2]))
        kmeans.centroids = cn.ImColorNamingTSELabDescriptor(kmeans.centroids)

    colors, which = getLabels(kmeans, options)
    return colors, which, kmeans
def KMeans(self, X):
    # NB: this method shadows sklearn's KMeans inside the class body, but
    # the global class is still what the name resolves to here.
    kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
    print(kmeans.labels_)                    # cluster index of each sample
    print(kmeans.predict([[0, 0], [4, 4]]))  # assign new points to clusters
    print(kmeans.cluster_centers_)           # fitted centroids
    return kmeans
def test_bikmeans(self):
    data_mat = mat(KMeans.loadDataSet("testSet2.txt"))
    k = 3
    centList, clusterAssment = KMeans.biKmeans(data_mat, k)
    print("\n centList == %s" % centList)
    print("\n clusterAssment == %s" % clusterAssment)
    # plot
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # collect per-cluster coordinates in dicts (the original used the
    # locals() trick to define variables dynamically, which is unreliable
    # inside a function)
    xCluster = {i: [] for i in range(k)}
    yCluster = {i: [] for i in range(k)}
    for i in range(shape(clusterAssment)[0]):
        c = int(clusterAssment[i, 0])
        xCluster[c].append(data_mat[i, 0])
        yCluster[c].append(data_mat[i, 1])
    print("\n xCluster0 == %s" % xCluster[0])
    print("\n yCluster0 == %s" % yCluster[0])
    for i in range(k):
        ax.scatter(xCluster[i], yCluster[i], s=30, c='orange', marker='s')
    # plot the centroids
    xcord2 = centList[:, 0].A
    ycord2 = centList[:, 1].A
    ax.scatter(xcord2, ycord2, s=100, c='red', marker='+')
    plt.show()
def processImage(im, options):
    """@brief   Finds the colors present in the input image
    @param im       LIST        input image
    @param options  DICTIONARY  dictionary with options
    @return colors  LIST        colors of centroids of kmeans object
    @return indexes LIST        indexes of centroids with the same label
    @return kmeans  KMeans      object of the class KMeans
    """
    ## 1- CHANGE THE IMAGE TO THE CORRESPONDING COLOR SPACE FOR KMEANS
    if options['colorspace'] == 'ColorNaming':
        im = cn.ImColorNamingTSELabDescriptor(im)
    elif options['colorspace'] == 'Lab':
        im = color.rgb2lab(im)
    # RGB needs no conversion
    img = rescale(im, 1, preserve_range=True)  # rescale; otherwise it blows up
    img = np.reshape(img, (-1, img.shape[2]))

    ## 2- APPLY KMEANS ACCORDING TO 'OPTIONS' PARAMETER
    if options['K'] < 2:  # find the best K
        # use the reshaped image here too (the original passed the
        # unreshaped `im` in this branch)
        kmeans = km.KMeans(img, 0, options)
        kmeans.bestK()
    else:
        kmeans = km.KMeans(img, options['K'], options)
        kmeans.run()

    ## 3- GET THE NAME LABELS DETECTED ON THE 11 DIMENSIONAL SPACE
    if options['colorspace'] != 'ColorNaming':
        # reshape again; otherwise the test fails with Lab or RGB
        kmeans.centroids = np.reshape(kmeans.centroids,
                                      (-1, 1, kmeans.centroids.shape[1]))
        if options['colorspace'] == 'Lab':
            kmeans.centroids = color.lab2rgb(kmeans.centroids) * 255  # the extra step
        kmeans.centroids = cn.ImColorNamingTSELabDescriptor(kmeans.centroids)
        kmeans.centroids = np.reshape(kmeans.centroids,
                                      (-1, kmeans.centroids.shape[2]))

    #########################################################
    ## THE FOLLOWING 2 END LINES SHOULD BE KEPT UNMODIFIED
    #########################################################
    colors, which = getLabels(kmeans, options)
    return colors, which, kmeans
def processImage(im, options):
    """@brief   Finds the colors present in the input image
    @param im       LIST        input image
    @param options  DICTIONARY  dictionary with options
    @return colors  LIST        colors of centroids of kmeans object
    @return indexes LIST        indexes of centroids with the same label
    @return kmeans  KMeans      object of the class KMeans
    """
    # im = im.astype('uint8')

    ## 1- CHANGE THE IMAGE TO THE CORRESPONDING COLOR SPACE FOR KMEANS
    if options['colorspace'].lower() == 'colornaming':
        im = cn.ImColorNamingTSELabDescriptor(im)
    elif options['colorspace'].lower() == 'rgb':
        pass  # im = color.convert_colorspace(im, 'RGB', options['colorspace'])
    elif options['colorspace'].lower() == 'lab':
        im = im.astype('float64')
        im = color.rgb2lab(im / 255)
    elif options['colorspace'].lower() == 'hsv':
        im = color.rgb2hsv(im.astype('uint8'))

    ## 2- APPLY KMEANS ACCORDING TO 'OPTIONS' PARAMETER
    if options['K'] < 2:  # find the best K
        kmeans = km.KMeans(im, 0, options)
        kmeans.bestK()
    else:
        kmeans = km.KMeans(im, options['K'], options)
        kmeans.run()

    ## 3- GET THE NAME LABELS DETECTED ON THE 11 DIMENSIONAL SPACE
    if options['colorspace'].lower() == 'rgb':
        kmeans.centroids = cn.ImColorNamingTSELabDescriptor(kmeans.centroids)
    elif options['colorspace'].lower() == 'lab':
        kmeans.centroids = color.lab2rgb([kmeans.centroids])[0] * 255
        kmeans.centroids = cn.ImColorNamingTSELabDescriptor(kmeans.centroids)
    elif options['colorspace'].lower() == 'hsv':
        kmeans.centroids = color.hsv2rgb([kmeans.centroids])[0]
        kmeans.centroids = cn.ImColorNamingTSELabDescriptor(kmeans.centroids)

    #########################################################
    ## THE FOLLOWING 2 END LINES SHOULD BE KEPT UNMODIFIED
    #########################################################
    colors, which = getLabels(kmeans, options)
    return colors, which, kmeans
def run_test(fileName, max_k):
    cache_dir = './cache'
    D = 2.
    T = 3.
    L = 1.
    host, paras, phi = newickFormatReader.getInput(fileName)

    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
        f = open('%s/README' % cache_dir, 'w')
        f.write('This directory holds a cache of reconciliation graphs for the TreeLife data set')
        f.close()

    cache_location = '%s/%s.graph' % (cache_dir, os.path.split(fileName)[1])
    if not os.path.isfile(cache_location):
        print >> sys.stderr, 'A reconciliation graph has not been built yet for this newick file'
        print >> sys.stderr, 'Doing so now and caching it in {%s}...' % cache_location
        DictGraph, numRecon = DP.DP(host, paras, phi, D, T, L)
        f = open(cache_location, 'w+')
        f.write(repr(DictGraph))
        f.close()

    print >> sys.stderr, 'Loading reconciliation graph from cache'
    f = open(cache_location)
    DictGraph = eval(f.read())
    f.close()

    scoresList, dictReps = Greedy.Greedy(DictGraph, paras)
    print >> sys.stderr, 'Found cluster representatives using point-collecting'

    graph = ReconGraph.ReconGraph(DictGraph)
    setReps = [ReconGraph.dictRecToSetRec(graph, dictRep) for dictRep in dictReps]
    random.seed(0)
    extra_reps = [KMeans.get_template(graph) for i in xrange(max_k)]
    representatives = setReps + extra_reps

    print >> sys.stderr, 'Starting K Means algorithm ...'
    print >> sys.stderr, 'Printing Average and Maximum cluster radius at each step'
    for i in xrange(1, max_k + 1):
        print 'k = %d' % i
        KMeans.k_means(graph, 10, i, 0, representatives[:i])
def processImage(im, options):
    """@brief   Finds the colors present in the input image
    @param im       LIST        input image
    @param options  DICTIONARY  dictionary with options
    @return colors  LIST        colors of centroids of kmeans object
    @return indexes LIST        indexes of centroids with the same label
    @return kmeans  KMeans      object of the class KMeans

    This is the main function the whole project builds on. The input image
    is converted to a three-dimensional space our K-Means algorithm can
    work with; the color labels are then derived from the centroids, and
    the success rate of the run is measured.
    """
    ## 1- Convert the given color space so it works with our algorithm.
    if options['colorspace'].lower() == 'colornaming':
        im = cn.ImColorNamingTSELabDescriptor(im)
    elif options['colorspace'].lower() == 'rgb':
        pass
    elif options['colorspace'].lower() == 'lab':
        im = color.rgb2lab(im)

    ## 2- Inspect the parameter K to decide whether bestK has to run.
    ##    We pass K=0 if and only if bestK must be computed, so K=1 also
    ##    triggers it.
    if options['K'] < 2:
        kmeans = km.KMeans(im, 0, options)
    else:
        kmeans = km.KMeans(im, options['K'], options)
        kmeans.run()

    ## 3- Obtain the color labels of the image.
    if options['colorspace'].lower() == 'lab':
        kmeans.centroids = color.lab2rgb([kmeans.centroids])[0] * 255
        kmeans.centroids = cn.ImColorNamingTSELabDescriptor(kmeans.centroids)
    elif options['colorspace'].lower() == 'rgb':
        kmeans.centroids = cn.ImColorNamingTSELabDescriptor(kmeans.centroids)

    #########################################################
    ## THE FOLLOWING 2 END LINES SHOULD BE KEPT UNMODIFIED
    #########################################################
    colors, which = getLabels(kmeans, options)
    return colors, which, kmeans
def __init__(self, n_cluster, data, use_kmeans=True, w=0.5, c1=0.8, c2=0.6):
    index = np.random.choice(list(range(len(data))), n_cluster)
    self.centroids = data[index].copy()
    if use_kmeans:
        kmeans = KMeans(n_cluster=n_cluster, init_pp=False)
        kmeans.fit(data)
        self.centroids = kmeans.centroid.copy()
    self.best_position = self.centroids.copy()
    self.best_score = quantization_error(self.centroids, self._predict(data), data)
    self.best_sse = calc_sse(self.centroids, self._predict(data), data)
    self.velocity = np.zeros_like(self.centroids)
    self._w = w
    self._c1 = c1
    self._c2 = c2
def test_rand_cent1(self):
    data_mat = mat(KMeans.loadDataSet("testSet.txt"))
    min0 = min(data_mat[:, 0])
    max0 = max(data_mat[:, 0])
    print("\n min0 == %s  max0 == %s" % (min0, max0))
    min1 = min(data_mat[:, 1])
    max1 = max(data_mat[:, 1])
    print("\n min1 == %s  max1 == %s" % (min1, max1))
    cent = KMeans.randCent(data_mat, 2)
    print("\n cent == %s" % cent)
    dist_eclud = KMeans.distEclud(data_mat[0], data_mat[1])
    print("\n dist_eclud == %s" % dist_eclud)
def processImage(im, options):
    """@brief   Finds the colors present in the input image
    @param im       LIST        input image
    @param options  DICTIONARY  dictionary with options
    @return colors  LIST        colors of centroids of kmeans object
    @return indexes LIST        indexes of centroids with the same label
    @return kmeans  KMeans      object of the class KMeans
    """
    ## 1- CHANGE THE IMAGE TO THE CORRESPONDING COLOR SPACE FOR KMEANS
    if options['colorspace'].lower() == 'colornaming':
        im = cn.ImColorNamingTSELabDescriptor(im)
    elif options['colorspace'].lower() == 'rgb':
        pass  # already in RGB
    elif options['colorspace'].lower() == 'lab':
        im = color.rgb2lab(im)
    im = np.array(im).reshape((-1, 3))

    ## 2- APPLY KMEANS ACCORDING TO 'OPTIONS' PARAMETER
    if options['K'] < 2:  # find the best K
        kmeans = km.KMeans(im, 0, options)
        kmeans.bestK()  # bestK according to the fitting measure
    else:
        kmeans = km.KMeans(im, options['K'], options)
        kmeans.run()

    ## 3- GET THE NAME LABELS DETECTED ON THE 11 DIMENSIONAL SPACE
    if options['colorspace'].lower() == 'rgb':
        kmeans.centroids = cn.ImColorNamingTSELabDescriptor(kmeans.centroids)
    elif options['colorspace'].lower() == 'lab':
        pass

    #########################################################
    ## THE FOLLOWING 2 END LINES SHOULD BE KEPT UNMODIFIED
    #########################################################
    colors, which = getLabels(kmeans, options)
    return colors, which, kmeans
def test1(self):
    print "TEST 1:----------------------------------------------------------------"
    features = np.array([[1.9, 2.3], [1.5, 2.5], [0.8, 0.6],
                         [0.4, 1.8], [0.1, 0.1], [0.2, 1.8],
                         [2.0, 0.5], [0.3, 1.5], [1.0, 1.0]])
    whitened = whiten(features)
    book = np.array((whitened[0], whitened[2]))
    numpy_result = kmeans(whitened, book)[0]
    print numpy_result
    print ""

    features2 = np.array([[1.9, 2.3, 0], [1.5, 2.5, 0], [0.8, 0.6, 0],
                          [0.4, 1.8, 0], [0.1, 0.1, 0], [0.2, 1.8, 0],
                          [2.0, 0.5, 0], [0.3, 1.5, 0], [1.0, 1.0, 0]])
    whitened2 = whiten(features2)
    # seed with rows of whitened2 (the original indexed the two-column
    # `whitened` here, which has the wrong dimensionality)
    book2 = [whitened2[0], whitened2[2]]
    our_result = np.array(KMeans.k_means2(whitened2.tolist(), 2, book2).centroids)[:, :-1]
    print our_result
def fit(self, data):
    # step 1: construct the weight matrix for every point
    # weight = kneighbors_graph(data, n_neighbors=self.n_neighbors_, mode='distance', include_self=False)
    weight = kneighbors_graph(data, n_neighbors=self.n_neighbors_,
                              mode='connectivity', include_self=False)
    weight = 0.5 * (weight + weight.T)  # symmetrize
    self.weight_ = weight.toarray()
    self.degree_ = np.diag(np.sum(self.weight_, axis=0).ravel())

    # step 2: construct the Laplacian matrix and normalize it
    self.laplacians_ = self.degree_ - self.weight_
    degree_nor = np.sqrt(np.linalg.inv(self.degree_))
    self.laplacians_ = np.dot(degree_nor, self.laplacians_)
    self.laplacians_ = np.dot(self.laplacians_, degree_nor)  # normalized Laplacian

    # step 3: take the eigenvectors of the k smallest eigenvalues and
    # normalize each row
    eigen_values, eigen_vector = np.linalg.eigh(self.laplacians_)
    sort_index = eigen_values.argsort()
    eigen_vector = eigen_vector[:, sort_index]
    self.eigen_vector_ = np.asarray(
        [eigen_vector[:, i] for i in range(self.n_clusters_)]).T
    self.eigen_vector_ /= np.linalg.norm(self.eigen_vector_, axis=1).reshape(data.shape[0], 1)

    # step 4: k-means on the spectral embedding
    spectral_kmeans = KMeans.K_Means(n_clusters=self.n_clusters_)
    spectral_kmeans.fit(self.eigen_vector_)
    spectral_label = spectral_kmeans.predict(self.eigen_vector_)
    self.label_ = spectral_label
    self.fitted = True
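# For context, the normalized Laplacian built in fit() above is
# L_sym = D^(-1/2) (D - W) D^(-1/2). A standalone sanity check on a toy
# graph, using only NumPy (the custom KMeans.K_Means is not needed here):
import numpy as np

# two connected pairs -> two components -> exactly two zero eigenvalues
W = np.array([[0, 1, 0, 0],
              [1, 0, 0, 0],
              [0, 0, 0, 1],
              [0, 0, 1, 0]], dtype=float)
D = np.diag(W.sum(axis=0))
D_inv_sqrt = np.sqrt(np.linalg.inv(D))
L_sym = D_inv_sqrt.dot(D - W).dot(D_inv_sqrt)

eigvals, _ = np.linalg.eigh(L_sym)  # ascending order
print(np.round(eigvals, 6))  # approximately [0, 0, 2, 2]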
def clusterClubs(numClust=5):
    datList = []
    for line in open('places.txt').readlines():
        lineArr = line.split('\t')
        datList.append([float(lineArr[4]), float(lineArr[3])])
    datMat = np.mat(datList)
    # cluster with the bisecting k-means algorithm
    myCentroids, clustAssing = KMeans.biKmeans(datMat, numClust, distMeas=distSLC)

    fig = plt.figure()
    rect = [0.1, 0.1, 0.8, 0.8]
    scatterMarkers = ['s', 'o', '^', '8', 'p', 'd', 'v', 'h', '>', '<']
    axprops = dict(xticks=[], yticks=[])
    ax0 = fig.add_axes(rect, label='ax0', **axprops)
    imgP = plt.imread('Portland.png')
    ax0.imshow(imgP)
    ax1 = fig.add_axes(rect, label='ax1', frameon=False)
    for i in range(numClust):
        ptsInCurrCluster = datMat[np.nonzero(clustAssing[:, 0].A == i)[0], :]
        markerStyle = scatterMarkers[i % len(scatterMarkers)]
        ax1.scatter(ptsInCurrCluster[:, 0].flatten().A[0],
                    ptsInCurrCluster[:, 1].flatten().A[0],
                    marker=markerStyle, s=90)
    # plot the centroids
    for i in range(numClust):
        ax1.scatter(myCentroids[i].tolist()[0][0], myCentroids[i].tolist()[0][1],
                    s=300, c='k', marker='+', alpha=.5)
    plt.show()
def processImage(im, options):
    """@brief   Finds the colors present in the input image
    @param im       LIST        input image
    @param options  DICTIONARY  dictionary with options
    @return colors  LIST        colors of centroids of kmeans object
    @return indexes LIST        indexes of centroids with the same label
    @return kmeans  KMeans      object of the class KMeans
    """
    if options['colorspace'].lower() == 'colornaming':
        im = cn.ImColorNamingTSELabDescriptor(im)
    elif options['colorspace'].lower() == 'rgb':
        pass
    elif options['colorspace'].lower() == 'lab':
        im = im.astype('float64')
        im = color.rgb2lab(im / 255)

    kmeansAlgorithm = km.KMeans(im, options['K'], options)
    kmeansAlgorithm.run()

    if options['colorspace'].lower() == 'rgb':
        kmeansAlgorithm.centroids = cn.ImColorNamingTSELabDescriptor(
            kmeansAlgorithm.centroids)
    elif options['colorspace'].lower() == 'lab':
        kmeansAlgorithm.centroids = color.lab2rgb([kmeansAlgorithm.centroids])[0] * 255
        kmeansAlgorithm.centroids = cn.ImColorNamingTSELabDescriptor(
            kmeansAlgorithm.centroids)

    colors_obt, which_obt = getLabels(kmeansAlgorithm, options)
    return colors_obt, which_obt, kmeansAlgorithm
def runExperimentsForKMeans(numClusters, numExperiments, useTFIDF=True):
    maxSim = 0
    experiment = 0
    for i in range(numExperiments):
        print "\nExperiment " + str(i) + " for " + str(numClusters) + " clusters"
        [clusterAssignment, similarity] = KMeans.findClusterAssignment(numClusters, useTFIDF)
        outputFileName = "output/KMeans" + str(numClusters) + "-" + str(i)
        KMeans.printDocumentClusters(clusterAssignment, outputFileName)
        if maxSim < similarity:
            maxSim = similarity
            experiment = i
    print "Max Similarity over " + str(numExperiments) + " Experiments: " + str(maxSim)
    return maxSim, experiment
def main():
    try:
        graphyboi = GraphController()
        graphyboi.update()
        kmeansyboi = KMeans(graphyboi.coords)
    except Exception:
        # a bare `except:` would also swallow KeyboardInterrupt/SystemExit
        print("Nothing in coords")
def test_kmeans(self):
    data_mat = mat(KMeans.loadDataSet("testSet.txt"))
    centroids, clusterAssment = KMeans.kMeans(data_mat, 4)
    print("\n centroids == \n %s" % centroids)
    print("\n clusterAssment == \n %s" % clusterAssment)
    # plot
    fig = plt.figure()
    ax = fig.add_subplot(111)
    xcord = data_mat[:, 0].A
    ycord = data_mat[:, 1].A
    xCluster0, yCluster0 = [], []
    xCluster1, yCluster1 = [], []
    xCluster2, yCluster2 = [], []
    xCluster3, yCluster3 = [], []
    for i in range(shape(clusterAssment)[0]):
        if clusterAssment[i, 0] == 0:
            xCluster0.append(data_mat[i, 0])
            yCluster0.append(data_mat[i, 1])
        elif clusterAssment[i, 0] == 1:
            xCluster1.append(data_mat[i, 0])
            yCluster1.append(data_mat[i, 1])
        elif clusterAssment[i, 0] == 2:
            xCluster2.append(data_mat[i, 0])
            yCluster2.append(data_mat[i, 1])
        elif clusterAssment[i, 0] == 3:
            xCluster3.append(data_mat[i, 0])
            yCluster3.append(data_mat[i, 1])
    ax.scatter(xCluster0, yCluster0, s=30, c='orange', marker='s')
    ax.scatter(xCluster1, yCluster1, s=30, c='red', marker='p')
    ax.scatter(xCluster2, yCluster2, s=30, c='blue', marker='*')
    ax.scatter(xCluster3, yCluster3, s=30, c='black', marker='d')
    # plot the raw data
    # ax.scatter(xcord, ycord, s=10, c='orange', marker='s')
    # plot the centroids
    xcord2 = centroids[:, 0].A
    ycord2 = centroids[:, 1].A
    ax.scatter(xcord2, ycord2, s=100, c='red', marker='+')
    plt.show()
def Cluster(cluster_zone):
    print("step 2: Building Dataset...")
    dataSet = []
    for i in g_macro_node_list:
        temp = [float(i.x_coord), float(i.y_coord)]
        dataSet.append(temp)
    dataSet = mat(dataSet)

    print("step 3: Clustering...")
    centroids, clusterAssment = KMeans.kmeans(dataSet, cluster_zone)

    print("step 3: show the result...")
    KMeans.showCluster(dataSet, cluster_zone, centroids, clusterAssment)
    return centroids, clusterAssment, dataSet
def find(self, scores):
    scale = 100
    data = np.mat(scores).T * scale
    indices, distances, center = KMeans(
        lambda X, k: np.mat([X.min(), X.mean(), X.max()]).T).clustering(data, 3, 1)
    print("anomaly score centers: {0}".format(center.T))

    checkValue = center[2, 0]
    minCheckValue = self._minCheckValue * scale
    maxCheckValue = self._maxCheckValue * scale
    defaultThreshold = self._defaultThreshold * scale
    minValue = data[(indices == 2).A.flatten(), :].min(0)[0, 0]
    maxValue = data[(indices == 2).A.flatten(), :].max(0)[0, 0]

    if maxValue <= defaultThreshold:
        return defaultThreshold / scale

    if checkValue >= defaultThreshold:
        checkValue = (minValue + checkValue) / 2
    elif checkValue <= minCheckValue:
        checkValue = (checkValue + maxValue) / 2
    if checkValue < minCheckValue:
        checkValue = minCheckValue
    elif checkValue > maxCheckValue:
        checkValue = maxCheckValue
    print("threshold check value: {0}".format(checkValue))

    i = None
    for j in range(0, data.shape[0]):
        if data[j, 0] >= checkValue and i is None:
            i = j
        if data[j, 0] < checkValue and i is not None:
            if j - i > CurvesThresholdFinder.MIN_SAMPLES_NUMBER * 2:
                x, y = self._fit(data[i:j, 0])
                if self.__showPlot:
                    plt.figure(1, (16, 10))
                    plt.plot(list(range(0, j - i)), data[i:j, 0].A.flatten().tolist(),
                             color="b", marker="x")
                    if x is not None and y is not None:
                        plt.plot(x, y, color="r")
                    plt.show()
            i = None

    print("threshold all values: {0}".format(self._values))
    threshold = (np.mean(self._values) if len(self._values) > 0 else defaultThreshold) / scale
    print("threshold found: {0}".format(threshold))
    self.__reset()
    return threshold
def main():
    data_in = []
    feed_id = []
    print('start reading data')
    # read_json.read_json(config.path_in, data_in, config.stop_word_path, feed_id, config.data_lines)
    # read_json.test_alignment_py('../data/cpp/small.txt', '../data/cpp/python_alignment_extraction1.txt', 'stop_words.utf8', True, True, 5, data_in)
    # print('finish reading data')
    # term_id = []
    id_url = []
    # read_liulanqi_data.read_data(config.path_in, data_in, id_url, 50)
    read_weishi_data.read_json(config.path_in, data_in, None, feed_id,
                               config.data_lines, None, config.topk)

    if config.mode == 'Training':
        if config.model_name == 'Counter':
            model = Vectorizer.CounterVector(config.model_name)
        elif config.model_name == 'TfIdf':
            model = Vectorizer.TfIdfVector(config.model_name)
            print('finish initializing model')
        elif config.model_name == 'FeatureHasher':
            model = Vectorizer.FeatureHasherVector(config.model_name, config.n_features)
        model.feature_transform(data_in)
        print(len(model.vectorizer.vocabulary_))
        model.serilize_model()
        if config.algo_name == 'KMeans':
            algo_instance = KMeans.KMeansClustering(config.algo_name)
            print('start training model')
            algo_instance.fit(model.feature)
            algo_instance.serilize_model()
            algo_instance.output_cluster_info(data_in, model, feed_id)
    else:
        print('loading vectorizer')
        model = BaseModel.BaseModel(config.model_name)
        model.de_serilize_model()
        print('finish loading vector')
        if config.algo_name == 'KMeans':
            algo_instance = Algorithm.Base_Algorithm(config.algo_name)
            algo_instance.de_serilize_model()
            print('finish deserialization')
            features = model.transform(data_in)
            labels = algo_instance.predict(features)
            print(labels)
            # algo_instance.get_centroids()
            # algo_instance.output_cluster_info(data_in, model, feed_id)
    print('finish all')
def get_cluster():
    cluster_list = dict()
    for k in range(3, 9):
        k_temp = KMeans.KMeans(k)
        for j in range(2010, 2013):
            print("Fitting, k=", k, "Year=", j)
            k_temp.fit(data2test[j])
            s_key = str(k) + "_" + str(j)
            cluster_list[s_key] = k_temp.cluster
    return cluster_list
def ReDo(self, actions, eegs):
    self.db = []
    if self.data:
        for r in range(len(self.data)):
            t = []
            for c in range(len(self.data[r]) - 1):
                t.append(self.data[r][c])
            self.db.append(t)
    self.k = KMeans(self.db, actions[0], actions[1], actions[2], actions[3])
    self.viewButton.Enable()
def processImage(im, options):  # DO NOT MODIFY
    im = rescale(im, 0.25, preserve_range=True)
    if options['colorspace'] == 'ColorNaming':  # DO NOT MODIFY
        im = np.reshape(im, (-1, im.shape[2]))
        im = cn.ImColorNamingTSELabDescriptor(im)
    elif options['colorspace'] == 'RGB':  # DO NOT MODIFY
        im = np.reshape(im, (-1, im.shape[2]))
    elif options['colorspace'] == 'Lab':  # DO NOT MODIFY
        im = cn.RGB2Lab(im)
        im = np.reshape(im, (-1, im.shape[2]))

    if options['K'] < 2:  # find the best K
        fitting = []
        for i in range(2, 15):
            kmeans = km.KMeans(im, i, options)
            fitting.append(kmeans.fitting())
        print "optimal k = ", fitting.index(min(fitting)) + 3
        kmeans = km.KMeans(im, fitting.index(min(fitting)) + 3, options)
    else:
        kmeans = km.KMeans(im, options['K'], options)  # DO NOT MODIFY

    if options['colorspace'] == 'RGB':
        kmeans.centroids = cn.ImColorNamingTSELabDescriptor(kmeans.centroids)  # centroids to color naming
    elif options['colorspace'] == 'Lab':
        kmeans.centroids = np.reshape(kmeans.centroids, (-1, 1, kmeans.centroids.shape[1]))
        kmeans.centroids = color.lab2rgb(kmeans.centroids) * 255
        kmeans.centroids = cn.ImColorNamingTSELabDescriptor(kmeans.centroids)  # centroids to color naming
        kmeans.centroids = np.reshape(kmeans.centroids, (-1, kmeans.centroids.shape[2]))
    # get a representation of kmeans.centroids in terms of the 11 basic colors
    # normalize: each row of kmeans.centroids should sum to 1
    colors, which = getLabels(kmeans, options)  # DO NOT MODIFY
    return colors, which, kmeans  # DO NOT MODIFY
def post(self):
    year = self.get_argument("year")
    taskId = self.get_argument("taskId")
    # connect to the MySQL database server
    self.db = MysqlDriver(DATABASE.host, DATABASE.username,
                          DATABASE.password, DATABASE.dbname)
    dataset = self.db.getFeatures()
    km = KMeans(dataset, 50, 3, year, taskId)  # initiate the ML algorithm
    MLthread = Thread(target=km.main, args=())
    MLthread.daemon = True
    MLthread.start()
    self.write("Request created")  # send the "request created" response
    self.finish()
def __initialization(self):
    """
    Initialize all parameters by k-means.

    Returns:
        Initial mu, sigma, gamma
    """
    km = KMeans.KMeans(self.k)
    labels = km.fit_predict(self.x, 50)
    mu = np.array([np.average(self.x[labels == i], axis=0) for i in range(self.k)])
    sigma = np.array([np.eye(self.dimension) + 1 for i in range(self.k)])
    gamma = np.array([[-math.log(self.k, math.e)] * self.k] * self.x.shape[0])
    return mu, sigma, gamma
def fit(self, data):
    # assignment 3
    # step 1: initialize the GMM attributes with k-means
    k_means = KMeans.K_Means(self.n_clusters_)
    k_means.fit(data)
    self.mu_ = np.asarray(k_means.centers_)
    print(self.n_clusters_)
    self.prior_ = np.asarray([1 / self.n_clusters_] * self.n_clusters_).reshape(
        self.n_clusters_, 1)
    self.posteriori_ = np.zeros((self.n_clusters_, len(data)))
    self.cov_ = np.asarray([eye(2, 2)] * self.n_clusters_)

    # step 2: iterate
    Likelihood_value_before = -inf
    for i in range(self.max_iter_):
        # step 3 (E-step): evaluate the responsibility of every point for
        # each component and normalize over components
        print("gmm iterator:", i)
        for k in range(self.n_clusters_):
            self.posteriori_[k] = multivariate_normal.pdf(x=data,
                                                          mean=self.mu_[k],
                                                          cov=self.cov_[k])
        self.posteriori_ = np.dot(diag(self.prior_.ravel()), self.posteriori_)
        self.posteriori_ /= np.sum(self.posteriori_, axis=0)

        # step 4 (M-step): re-estimate the parameters from the
        # responsibilities; stop once the improvement falls below the
        # threshold
        self.Nk_ = np.sum(self.posteriori_, axis=1)
        self.mu_ = np.asarray([np.dot(self.posteriori_[k], data) / self.Nk_[k]
                               for k in range(self.n_clusters_)])
        self.cov_ = np.asarray([
            np.dot((data - self.mu_[k]).T,
                   np.dot(np.diag(self.posteriori_[k].ravel()),
                          data - self.mu_[k])) / self.Nk_[k]
            for k in range(self.n_clusters_)])
        # mixing weights pi_k = N_k / N; the original divided by the
        # number of clusters instead of the number of points
        self.prior_ = np.asarray(self.Nk_ / len(data)).reshape(self.n_clusters_, 1)

        Likelihood_value_after = np.sum(np.log(self.posteriori_))
        print(Likelihood_value_after - Likelihood_value_before)
        if np.abs(Likelihood_value_after - Likelihood_value_before) < \
                self.tolerance_ * self.n_clusters_:
            break
        Likelihood_value_before = np.copy(Likelihood_value_after)

    self.fitted = True
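# A quick sanity check on the E-step above, assuming posteriori_ has shape
# (k, N): after normalization, every point's responsibilities must form a
# probability distribution, i.e. each column sums to 1.
import numpy as np

posteriori = np.random.rand(3, 100)        # stand-in for the weighted pdf values
posteriori /= np.sum(posteriori, axis=0)   # same normalization as in fit()
assert np.allclose(np.sum(posteriori, axis=0), 1.0)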
def main():
    data_in = []
    feed_id = []
    print('start reading data')
    path = 'E:\\QQ_Browser_data\\ruyizhuan.csv'
    path2 = 'E:\\QQ_Browser_data\\yanxigonglue.csv'
    tv_show.process_data(path, feed_id, data_in)
    tv_show.process_data(path2, feed_id, data_in)

    if config.mode == 'Training':
        if config.model_name == 'Counter':
            model = Vectorizer.CounterVector(config.model_name)
        elif config.model_name == 'TfIdf':
            model = Vectorizer.TfIdfVector(config.model_name)
            print('finish initializing model')
        elif config.model_name == 'FeatureHasher':
            model = Vectorizer.FeatureHasherVector(config.model_name, config.n_features)
        model.feature_transform(data_in)
        print(len(model.vectorizer.vocabulary_))
        if config.algo_name == 'KMeans':
            algo_instance = KMeans.KMeansClustering(config.algo_name)
            print('start training model')
            algo_instance.fit(model.feature)
            algo_instance.serilize_model()
            print('finish serializing model')
            algo_instance.output_cluster_info(data_in, model, feed_id)
    else:
        print('loading vectorizer')
        model = BaseModel.BaseModel(config.model_name)
        model.de_serilize_model()
        print('finish loading vector')
        if config.algo_name == 'KMeans':
            algo_instance = Algorithm.Base_Algorithm(config.algo_name)
            algo_instance.de_serilize_model()
            print('finish deserialization')
            features = model.transform(data_in)
            labels = algo_instance.predict(features)
            print(labels)
            # algo_instance.get_centroids()
            # algo_instance.output_cluster_info(data_in, model, feed_id)
    print('finish all')
def test2(self):
    print "TEST 2:----------------------------------------------------------------"
    rand.seed(777)
    sampler = swr.SampleWithoutReplacement('datasets/adjusted-abalone.csv', .10)
    sampler.z_scale()
    training_set = sampler.get_training_set()
    test_set = sampler.get_test_set()

    indices_selected = list()
    centroids = [None] * 4
    for i in range(4):
        while True:
            index_selected = np.random.randint(0, len(training_set))
            if index_selected not in indices_selected:
                centroids[i] = training_set[index_selected]
                indices_selected.append(index_selected)
                break

    numpy_result = kmeans(np.array(training_set)[:, :-1], np.array(centroids)[:, :-1])[0]
    our_result = np.array(KMeans.k_means2(training_set, 4, centroids).centroids)[:, :-1]
    print numpy_result
    print ""
    print our_result
def assign_to_cluster(input_set, centroids):
    """
    Assigns every observation in the input_set to a cluster. Clusters are
    centered around the given centroids.

    :param input_set: list of observations from the test set to put into a
        cluster; has format [list of [observations]]
    :param centroids: list of centroids from running K-means on a training
        set; has format [list of centroids]
    :return: the cluster assignments of the test-set observations; has
        format [list of [observation indices] per cluster]
    """
    if len(centroids) < 1:
        raise Exception('No centroids were given.')
    if len(input_set) < 1:
        raise Exception('No input observations were given.')

    # cluster_set = [[] for a in range(len(centroids))]
    cluster_indices = [[] for a in range(len(centroids))]
    for i in range(len(input_set)):
        min_dist = sys.maxint
        min_index = sys.maxint
        for j in range(len(centroids)):
            curr_dist = KMeans.euclidean_distance(input_set[i], centroids[j])
            if curr_dist < min_dist:
                min_dist = curr_dist
                min_index = j
        cluster_indices[min_index].append(i)
    return cluster_indices
def run_test(fileName, max_k):
    cache_dir = './cache'
    D = 2.
    T = 3.
    L = 1.
    print >> sys.stderr, "FILE: ", fileName
    print fileName
    host, paras, phi = newickFormatReader.getInput(fileName)

    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
        f = open('%s/README' % cache_dir, 'w')
        f.write('This directory holds a cache of reconciliation graphs for the TreeLife data set')
        f.close()

    cache_location = '%s/%s.graph' % (cache_dir, os.path.split(fileName)[1])
    recon_count_location = '%s/%s.count' % (cache_dir, os.path.split(fileName)[1])
    if not os.path.isfile(cache_location) or not os.path.isfile(recon_count_location):
        print >> sys.stderr, 'A reconciliation graph has not been built yet for this newick file'
        print >> sys.stderr, 'Doing so now and caching it in {%s}...' % cache_location
        DictGraph, numRecon = DP.DP(host, paras, phi, D, T, L)
        f = open(cache_location, 'w+')
        g = open(recon_count_location, 'w+')
        f.write(repr(DictGraph))
        g.write(str(numRecon))
        f.close()
        g.close()

    print >> sys.stderr, 'Loading reconciliation graph from cache'
    f = open(cache_location)
    g = open(recon_count_location)
    DictGraph = eval(f.read())
    numRecon = float(g.read())
    f.close()
    g.close()

    # Only run the algorithm for reconciliations with more MPRs than the threshold
    if numRecon < recon_threshold:
        print >> sys.stderr, 'Too few reconciliations: ', numRecon
        return
    else:
        print >> sys.stderr, 'Reconciliation Count: ', numRecon

    scoresList, dictReps = Greedy.Greedy(DictGraph, paras)
    print >> sys.stderr, 'Found cluster representatives using point-collecting'

    graph = ReconGraph.ReconGraph(DictGraph)
    setReps = [ReconGraph.dictRecToSetRec(graph, dictRep) for dictRep in dictReps]
    random.seed(0)
    extra_reps = [KMeans.get_template(graph) for i in xrange(max_k)]
    representatives = setReps + extra_reps

    print >> sys.stderr, 'Starting K Means algorithm ...'
    print >> sys.stderr, 'Printing Average and Maximum cluster radius at each step'
    for seed in xrange(5):
        for i in xrange(1, max_k + 1):
            # print 'k = %d' % i
            # KMeans.k_means(graph, 10, i, 0, representatives[:i])
            KMeans.k_means(graph, 10, i, seed, None)
        print
def testReadFilePoints(self):
    points = KMeans.dataset_to_list_points(DATASET)
    self.assertTrue(len(points) > 0)
    self.assertTrue(points[0].dimension == 2)
def testGetNearestCluster(self):
    self.assertEqual(
        KMeans.get_nearest_cluster(
            [cluster, Cluster([Point(np.array([8, 8]))])], point),
        0)
print "Init - Create the first cluster points and plot them..." # TODO here is where you change the number of centroids by adding or removing the points. # The numbers represent the starting points of each centroid with the following coordinate pair: (x, y) clusterPoints = [Point(2, 3), Point(35, 20), Point(40, 40), Point(60, 60), Point(30, 30)] centroids = getCentroids(clusterPoints) # just convert the points to centroids for plotting and labeling assistance... plotter.plotCentroids(centroids) print "Init complete..." raw_input('Press enter to continue and to start the algorithm.') # Run the algorith 10 times # TODO So right now we are running the algorithm 10 times. Maybe we should come up with some better meassurement? for x in xrange(1,10): # Get lables print "Create the lables, this should take some time...." # The interesting part is what is going on in the classify method. labels = kmeans.classify(trainingX, trainingY, centroids) # Plot the labled data print "Plot the labled data." plotter.clear() plotter.plotCentroids(centroids) plotter.plotLabledData(trainingX, trainingY, labels, centroids) raw_input('Press enter to continue') # Recalculated the centroids and unlable the data so to say... print "Plot the new centroids." plotter.clear() plotter.plotUnlabledData(trainingX, trainingY) centroids = kmeans.reCalculateCentroids(trainingX, trainingY, labels, centroids) plotter.plotCentroids(centroids) raw_input('Press enter to continue')
import KMeans
import numpy

'''
dataMat = numpy.mat(KMeans.loadDataSet("E:/TestDatas/MachineLearningInAction/Ch10/testSet.txt"))
k = 4
centroids, clustAssing = KMeans.kMeans(dataMat, k)
KMeans.showCluster(dataMat, k, centroids, clustAssing)
'''
'''
dataMat = numpy.mat(KMeans.loadDataSet("E:/TestDatas/MachineLearningInAction/Ch10/testSet2.txt"))
k = 3
centroids, clustAssing = KMeans.biKmeans(dataMat, k)
KMeans.showCluster(dataMat, k, centroids, clustAssing)
'''
KMeans.clusterClubs("E:/TestDatas/MachineLearningInAction/Ch10/places.txt",
                    "E:/TestDatas/MachineLearningInAction/Ch10/Portland.png")
def learnvocabulary(train_set, cluster_num, max_iter):
    start = time.time()
    means = KMeans.mykmeanspp(train_set, cluster_num, max_iter, True)
    print("Kmeans Time: ", time.time() - start)
    return means
def run_test(fileName, max_k):
    cache_dir = './cache'
    D = 2.
    T = 3.
    L = 1.
    print >> sys.stderr, "FILE: ", fileName
    print fileName
    host, paras, phi = newickFormatReader.getInput(fileName)

    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
        f = open('%s/README' % cache_dir, 'w')
        f.write('This directory holds a cache of reconciliation graphs for the TreeLife data set')
        f.close()

    cache_location = '%s/%s.graph' % (cache_dir, os.path.split(fileName)[1])
    recon_count_location = '%s/%s.count' % (cache_dir, os.path.split(fileName)[1])
    if not os.path.isfile(cache_location) or not os.path.isfile(recon_count_location):
        print >> sys.stderr, 'A reconciliation graph has not been built yet for this newick file'
        print >> sys.stderr, 'Doing so now and caching it in {%s}...' % cache_location
        DictGraph, numRecon = DP.DP(host, paras, phi, D, T, L)
        f = open(cache_location, 'w+')
        g = open(recon_count_location, 'w+')
        f.write(repr(DictGraph))
        g.write(str(numRecon))
        f.close()
        g.close()

    print >> sys.stderr, 'Loading reconciliation graph from cache'
    f = open(cache_location)
    g = open(recon_count_location)
    DictGraph = eval(f.read())
    numRecon = float(g.read())
    f.close()
    g.close()

    # Only run the algorithm for reconciliations with more MPRs than the threshold
    if numRecon < recon_threshold:
        print >> sys.stderr, 'Too few reconciliations: ', numRecon
        return
    else:
        print >> sys.stderr, 'Reconciliation Count: ', numRecon

    scoresList, dictReps = Greedy.Greedy(DictGraph, paras)
    graph = ReconGraph.ReconGraph(DictGraph)
    representatives = [ReconGraph.dictRecToSetRec(graph, dictReps[0])]

    ## Debug info
    ## Modifies the graph
    ## Checking for the case when there is an error in likelihood
    print >> sys.stderr, "== Checking for likelihoods over 1 =="
    found = False
    for key in DictGraph.keys():
        children = DictGraph[key]
        for child in children[:-1]:
            if child[-1] > 1:
                # Attempt to round to fix large float math errors
                roundedValue = round(child[-1])
                if roundedValue != 1.0:
                    print >> sys.stderr, "ERR FOUND: ", key, child
                    found = True
    if not found:
        print >> sys.stderr, "NO ERR(s)"
    print >> sys.stderr, "== End of over 1 checks. =="

    print >> sys.stderr, 'Starting K-centers algorithm ... '
    for i in xrange(2, max_k + 2):
        d, newrep = maximize(graph, representatives)
        if not all(d_i > 0 for d_i in d):
            print >> sys.stderr, "Distance vector contains 0", d
            break
        print i - 1, min(d),
        representatives.append(newrep)

        dist_sum = 0
        n = 10
        for _ in xrange(n):
            reps = [KMeans.get_weighted_template(graph) for _ in xrange(i - 1)]
            dist_sum += min_d(maximize(graph, reps))
        print float(dist_sum) / n
    print >> sys.stderr, "Finished k centers algorithm ..."
Summary:
1. If there is no seed, the random numbers differ on each run, so KMeans
   will not be stable. A stable algorithm gives the same result on every
   run, i.e. we get the same centroids each time.
2. When k is 1, all data points fall into a single group whose center is
   at (0, 0) in every dimension, because the dataset was normalized before
   running k-means. When k is small there is little or no overlap between
   groups; when k is large the overlap becomes severe, so more points may
   be partitioned incorrectly as the number of clusters grows.
Note: in the normal case, the neighboring points of a centroid belong to
the same class, but as k grows the zone adjacent to a centroid may include
points of other classes.
'''
for k in range(1, 11):
    means, clusters = KMeans.mykmean(X, k, max_iter)
    # print(means)
    data, target, data_clusters, target_clusters = data_recovery_more(clusters, X)
    show_figure_clusters(data_clusters, target_clusters, means)
# end for
'''
Section 2.2
Note: I choose the initial centroids from the run with minimum distortion.
Summary:
1. By running many times, we can get a more stable result.
2. The distortion-versus-iteration figure shows that the distortion drops
   to a stable value as the iterations proceed.
'''
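# A sketch of the restart strategy described above (keep the run with the
# minimum distortion), written against scikit-learn rather than
# KMeans.mykmean, whose signature is only assumed from the loop above:
import numpy as np
from sklearn.cluster import KMeans as SkKMeans


def best_of_n_runs(X, k, n_runs=20):
    """Run k-means n_runs times with different seeds and keep the run with
    the lowest distortion (inertia); fixing the seeds makes it repeatable."""
    best = None
    for seed in range(n_runs):
        km = SkKMeans(n_clusters=k, n_init=1, random_state=seed).fit(X)
        if best is None or km.inertia_ < best.inertia_:
            best = km
    return best  # best.cluster_centers_, best.labels_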
test_set = sampler.get_test_set()

# Tried to see if it made a difference if QR was performed on unscaled datasets
rand.seed(777)
sampler2 = swr.SampleWithoutReplacement('datasets/adjusted-abalone.csv', .10)
unscaled_training_set = sampler2.get_training_set()
unscaled_test_set = sampler2.get_test_set()

global_wcss = list()
global_rmse = list()

# Run K-means on the data set and output results from it
for i in [1, 2, 4, 8, 16]:
    # Run K-means on the training set and store the data
    results = KMeans.k_means(training_set, i)
    global_wcss.append(sum(results.wcss))

    # Calculate the mean, sd, and weights of all clusters
    cluster_weights = [None] * i
    cluster_info = list()
    for j in range(len(results.clusters)):
        info = calculate_cluster_values(results, j, unscaled_training_set)
        cluster_info.append(info)
        cluster_weights[j] = list(info.weights)

    # Assign all observations in the test set to clusters
    test_clusters = assign_to_cluster(test_set, results.centroids)

    # Now predict y for the test clusters using the weights from the training clusters
    cluster_predictions = predict_categories(unscaled_test_set, test_clusters, cluster_weights)
# HomePage :
# Email    :
#################################################

from numpy import *
import time
import matplotlib.pyplot as plt

import KMeans

## step 1: load data
print("step 1: load data...")
# `dataSet` is a list whose elements are themselves two-element lists; each
# inner list is one sample holding its attribute values and class label.
# Much like the matrices we are used to, the result is an N x 2 matrix
# whose rows are the training samples.
dataSet = []
fileIn = open("D:/xuepython/testSet.txt")  # note the forward slashes
for line in fileIn.readlines():
    temp = []
    lineArr = line.strip().split('\t')  # line.strip() drops the trailing '\n'
    temp.append(float(lineArr[0]))
    temp.append(float(lineArr[1]))
    dataSet.append(temp)
    # dataSet.append([float(lineArr[0]), float(lineArr[1])])
fileIn.close()

## step 2: clustering...
print("step 2: clustering...")
dataSet = mat(dataSet)  # numpy's mat() converts the array into a matrix
k = 4
# call the kmeans method defined in the KMeans module
centroids, clusterAssment = KMeans.kmeans(dataSet, k)

## step 3: show the result
print("step 3: show the result...")
KMeans.showCluster(dataSet, k, centroids, clusterAssment)
def strategy(self):
    kmeans = KMeans(self.file_name)
    hospitals = kmeans.k_means()
    sorted_hospitals = sorted(hospitals.keys())
    k = 1
    for hospital in sorted_hospitals:
        sys.stdout.write("Hospital:" + str(hospital.id) + "|" + str(hospital.x) + "," +
                         str(hospital.y) + "," + str(hospital.ambu) + "|")
        for i in range(hospital.ambu):
            if i != hospital.ambu - 1:
                sys.stdout.write(str(k) + ",")
            else:
                sys.stdout.write(str(k) + "\n")
            k += 1
    ambulance_num = k - 1
    k = 1
    print
    for hospital in sorted_hospitals:
        patients = filter(
            lambda x: x.time > 2.3 * (abs(x.x - hospital.x) + abs(x.y - hospital.y)),
            hospitals[hospital])
        ambu_num = hospital.ambu
        ambulances = []
        for i in range(ambu_num):
            ambulances.append(Ambulance(k, hospital))
            k += 1
        while True:
            pre_num = len(self.saved_patients)
            for i in range(4):
                for ambulance in ambulances:
                    patients = sorted(patients,
                                      key=lambda x: abs(x.x - ambulance.x) + abs(x.y - ambulance.y))
                    for patient in patients:
                        if self.__savable(patient, ambulance, hospital, i):
                            ambulance.patients.append(patient)
                            patients.remove(patient)
                            ambulance.current_time += self.__distance(patient, ambulance) + 1
                            ambulance.x = patient.x
                            ambulance.y = patient.y
                            break
            for ambulance in ambulances:
                if len(ambulance.patients) == 0:
                    continue
                sys.stdout.write("Ambulance:" + str(ambulance.id) + "|" +
                                 str(hospital.x) + "," + str(hospital.y) + "|")
                ambulance.current_time += self.__distance(ambulance, hospital) + 1
                first = True
                for patient in ambulance.patients:
                    if patient.time >= ambulance.current_time:
                        if first:
                            sys.stdout.write(str(patient.id) + "," + str(patient.x) + "," +
                                             str(patient.y) + "," + str(patient.time))
                            first = False
                        else:
                            sys.stdout.write(";" + str(patient.id) + "," + str(patient.x) + "," +
                                             str(patient.y) + "," + str(patient.time))
                        self.saved_patients.append(patient)
                sys.stdout.write("|" + str(hospital.x) + "," + str(hospital.y) + "\n")
                ambulance.patients = []
                ambulance.x = hospital.x
                ambulance.y = hospital.y
            if len(self.saved_patients) == pre_num:
                break