def test_k_means_function():
    # test calling the k_means function directly
    # catch output
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        cluster_centers, labels, inertia = k_means(X, n_clusters=n_clusters,
                                                   verbose=True)
    finally:
        sys.stdout = old_stdout
    centers = cluster_centers
    assert_equal(centers.shape, (n_clusters, n_features))
    labels = labels
    assert_equal(np.unique(labels).shape[0], n_clusters)

    # check that the label assignment is perfect (up to a permutation)
    assert_equal(v_measure_score(true_labels, labels), 1.0)
    assert_greater(inertia, 0.0)

    # check warning when centers are passed
    with warnings.catch_warnings(record=True) as w:
        k_means(X, n_clusters=n_clusters, init=centers)
        assert_equal(len(w), 1)

    # too many clusters desired
    assert_raises(ValueError, k_means, X, n_clusters=X.shape[0] + 1)
def kmeans_analysis(G): block = nx.get_node_attributes(G,'block').values() xA, xL = get_embedding(G,2) cA,kmA,_ = k_means(xA,2) cB,kmL,_ = k_means(xL,2) # plt.subplot(221); plt.scatter(xA[:,0],xA[:,1],c=block) # plt.subplot(222); plt.scatter(xA[:,0],xA[:,1],c=kmA) # plt.subplot(223); plt.scatter(xL[:,0],xL[:,1],c=block) # plt.subplot(224); plt.scatter(xL[:,0],xL[:,1],c=kmL) ax = plt.subplot(121); plt.scatter(xA[:,0],xA[:,1],c=block,marker='x') ax.set_aspect('equal','datalim') lim = plt.axis() a = cA[0,:]-cA[1,:] a = np.array([1, -a[0]/a[1]]) b = np.mean(cA,axis=0) x = np.array([b+a,b-a]) plt.plot(x[:,0],x[:,1],'k--',linewidth=1) plt.axis(lim) ax = plt.subplot(122); plt.scatter(xL[:,0],xL[:,1],c=block,marker='x') ax.set_aspect('equal','datalim') lim = plt.axis() a = cB[0,:]-cB[1,:] a = np.array([1, -a[0]/a[1]]) b = np.mean(cB,axis=0) x = np.array([b+a,b-a]) plt.plot(x[:,0],x[:,1],'k--',linewidth=1) plt.axis(lim) compare_results(block,kmA,kmL) _,kmA,_ = k_means(xA,5) _,kmL,_ = k_means(xL,5) print "ALL FIVE" num_diff = vn.num_diff_w_perms(block, kmA) ari = adjusted_rand_score(block,kmA) print "Adjacency: num error="+repr(num_diff)+" ari="+repr(ari) num_diff = vn.num_diff_w_perms(block, kmL) ari = adjusted_rand_score(block,kmL) print "Laplacian: num error="+repr(num_diff)+" ari="+repr(ari)
def getCenteroidsByGapStats(dataToCluster, maxCluster):
    # data range, used to sample the uniform reference data sets
    xminData = np.min(dataToCluster[:,0])
    xmaxData = np.max(dataToCluster[:,0])
    yminData = np.min(dataToCluster[:,1])
    ymaxData = np.max(dataToCluster[:,1])
    numOfClusterRuns = maxCluster - 1
    sumSqMetricSave = np.zeros((1, numOfClusterRuns))
    wksMetricSave = np.zeros((1, numOfClusterRuns))
    wkbsMetricSave = np.zeros((1, numOfClusterRuns))
    kMetricSave = np.zeros((1, numOfClusterRuns), dtype=np.int32)
    skMetricSave = np.zeros((1, numOfClusterRuns))
    centeroidSave = []
    labelSave = []
    for clusterRun in xrange(1, maxCluster):
        centroids, labels, inertia = k_means(dataToCluster, n_clusters = clusterRun)
        centeroidSave.append(centroids)
        labelSave.append(labels)
        kMetricSave[0, clusterRun-1] = clusterRun
        # calculate gap statistics for selecting the number of clusters
        tempVar = calculateWk(centroids, labels, dataToCluster)
        sumSqMetricSave[0, clusterRun-1] = tempVar
        wksMetricSave[0, clusterRun-1] = np.log(tempVar)
        # ref data set
        bRef = 10
        BWkbs = np.zeros((1, bRef))
        for iRun in xrange(bRef):
            refData = np.zeros_like(dataToCluster)
            for dataRun in xrange(dataToCluster.shape[0]):
                refData[dataRun,:] = np.array([np.random.uniform(xminData, xmaxData), np.random.uniform(yminData, ymaxData)])
            centroidsRef, labelsRef, inertiaRef = k_means(refData, n_clusters = clusterRun)
            BWkbs[0, iRun] = np.log(calculateWk(centroidsRef, labelsRef, refData))
        wkbsMetricSave[0, clusterRun-1] = np.sum(BWkbs)/float(bRef)
        skMetricSave[0, clusterRun-1] = np.sqrt(np.sum((BWkbs - wkbsMetricSave[0, clusterRun-1])**2)/float(bRef))
    skMetricSave = skMetricSave*np.sqrt(1 + 1/float(bRef))
    # gap statistics
    gap = (wkbsMetricSave - wksMetricSave)
    gap = gap.reshape(1, -1)
    finalMetric = np.zeros((1, numOfClusterRuns))
    for iRun in xrange(1, maxCluster-1):
        # gap(k) - (gap(k+1) - s_(k+1))
        finalMetric[0, iRun-1] = gap[0, iRun-1] - (gap[0, iRun] - skMetricSave[0, iRun])
    indeNonZero = np.where(finalMetric>0)[1]
    selectIndex = np.min(indeNonZero)
    # final clustering picks
    selectCenteroids = np.array(centeroidSave[selectIndex])
    selectLabels = np.array(labelSave[selectIndex])
    return selectCenteroids
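# Illustrative usage (added, not part of the original snippet): calling
# getCenteroidsByGapStats on synthetic 2-D data. It assumes the helper calculateWk used
# above is defined in the same module and that the code runs under Python 2 (the function
# body uses xrange); the data and maxCluster value below are made up for the example.
from sklearn.datasets import make_blobs
demo_data, _ = make_blobs(n_samples=300, centers=3, n_features=2, random_state=0)
chosen_centroids = getCenteroidsByGapStats(demo_data, maxCluster=8)
# the gap statistic should typically settle on roughly three centroids for this data
print(chosen_centroids)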
def test_faith():
    """The 1st part of project3"""
    data_1 = xlrd.open_workbook(r'G:\pyproj\EE511Proj3\oldfaithful.xlsx')  # Change this path to the location of the data file
    table = data_1.sheet_by_name(u'Sheet1')
    x = table.col_values(0)
    y = table.col_values(1)
    c = []
    for i in range(0,len(x)):
        c.append(table.row_values(i))
    print(c)
    [centroid,label,inertia] = cluster.k_means(c,2)
    print(centroid)
    print(label)
    print(inertia)
    for a in range(0,len(label)):
        plt.scatter(x[a],y[a],c = 'b')
    plt.xlabel('eruptions')
    plt.ylabel('waiting')
    plt.title('Raw Data')
    plt.show()
    for j in range(0,len(label)):
        if label[j] == 1:
            plt.scatter(x[j],y[j],marker = '.',c = 'r')
        elif label[j] == 0:
            plt.scatter(x[j],y[j],marker = '*',c = 'b')
    plt.xlabel('eruptions')
    plt.ylabel('waiting')
    plt.title('Clustering of Data')
    plt.show()
    plt.clf()
def check_cluster(cluster):
    n = len(cluster)
    if n < 2:
        return True, []
    # Run k_means on two centers
    children, labels, _ = k_means(cluster, 2)
    # Let v = c1 - c2 be a d-dimensional vector that connects the two centers. This is the direction that k-means
    # believes to be important for clustering.
    v = children[1]-children[0]
    # Then project X onto v: x'_i = <x_i, v> / ||v||^2. X' is a 1-dimensional
    # representation of the data projected onto v.
    x_prime = [np.dot(point, v) for point in cluster]
    # Transform X' so that it has mean 0 and variance 1.
    x_prime = zscore(x_prime)
    # Let z_i = F(x'_(i)). If A^2_*(Z) is in the range of non-critical values at confidence level alpha, then accept H0,
    # keep the original center, and discard {c1, c2}. Otherwise, reject H0 and keep {c1, c2} in place of the original
    # center.
    a2, critical, sig = anderson(x_prime)
    a2 *= (1+4.0/n-25.0/(n**2))
    return a2 < critical[0], children
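# Illustrative usage (added, not from the original source): check_cluster implements a
# G-means style split test -- the data projected onto the axis joining the two k-means
# children is tested for normality with the Anderson-Darling statistic, and the single
# center is kept only if that projection looks Gaussian. Assumes numpy,
# scipy.stats.zscore/anderson and sklearn.cluster.k_means are imported as the function
# above requires; the blobs below are made up.
import numpy as np
rng = np.random.RandomState(0)
one_blob = rng.normal(0.0, 1.0, size=(200, 2))      # roughly Gaussian, should not be split
two_blobs = np.vstack([one_blob, one_blob + 8.0])   # clearly bimodal, should be split
keep_single, _ = check_cluster(one_blob)            # expected: True (accept H0)
keep_single2, children = check_cluster(two_blobs)   # expected: False, with two child centers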
def k_means_classifier(image): n_clusters = 8 # blur and take local maxima blur_image = gaussian(image, sigma=8) blur_image = ndi.maximum_filter(blur_image, size=3) # get texture features feats = local_binary_pattern(blur_image, P=40, R=5, method="uniform") feats_r = feats.reshape(-1, 1) # cluster the texture features km = k_means(n_clusters=n_clusters, batch_size=500) clus = km.fit(feats_r) # copy relevant attributes labels = clus.labels_ clusters = clus.cluster_centers_ # reshape label arrays labels = labels.reshape(blur_image.shape[0], blur_image.shape[1]) # segment shadow img = blur_image.ravel() shadow_seg = img.copy() for i in range(0, n_clusters): # set up array of pixel indices matching cluster mask = np.nonzero((labels.ravel() == i) == True)[0] if len(mask) > 0: thresh = threshold_otsu(img[mask]) shadow_seg[mask] = shadow_seg[mask] < thresh shadow_seg = shadow_seg.reshape(*image.shape) return shadow_seg
def cpie_mech_dy(turn): km = k_means(np.array([turn]).T, 2, random_state=1234)[0].flatten() km = np.sort(np.append(km, km.mean())) mech = np.zeros_like(turn) mech[((turn > km[0]) & (turn <= km[1]))] = 1 mech[(turn > km[2])] = 1 return mech
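# Illustrative usage (added): the two 1-D k-means centers plus their midpoint give three
# sorted thresholds; samples falling between the lower center and the midpoint, or above
# the upper center, are flagged with 1. The synthetic bimodal signal below is made up.
import numpy as np
rng = np.random.RandomState(0)
turn_demo = np.concatenate([rng.normal(0.2, 0.05, 100), rng.normal(0.9, 0.05, 100)])
mech_demo = cpie_mech_dy(turn_demo)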
def feature_clust(f_pool, f_train, n_clust, method='unsupervised-spectral'): N_pool = len(f_pool) data_f_pool = list(f_pool) data_f_train = list(f_train) data_f_pool.extend(data_f_train) if method == 'unsupervised-ds-svm': labels = ds_svm_clustering(data_f_pool, n_clust=n_clust, eta=4, ds_ratio=0.25, plot=False, metric='euclidean') elif method == 'unsupervised-spectral': spectral = cl.SpectralClustering(n_clusters=n_clust, eigen_solver='arpack', affinity="nearest_neighbors", n_jobs=6) spectral.fit(data_f_pool) labels = spectral.labels_ elif method == 'unsupervised-kmeans': clusters = cl.k_means(data_f_pool, n_clust) #Kmeans Clustering labels = clusters[1] elif method == 'unsupervised-kmedoids': _, labels = k_medoids_selection(data_f_pool, n_clust) else: raise ValueError('Invalid clustering method!') clust_pool = labels[0:N_pool] clust_train = labels[N_pool:] return clust_pool, clust_train
def regularized_spectral_clustering(adj_matrix, tau, n_clusters, algo='scan'):
    """
    :param adj_matrix: adjacency matrix representation of graph where [m][n] >0 if there is edge and [m][n] = weight
    :param n_clusters: cluster partitioning constant
    :param algo: the clustering separation algorithm, possible value kmeans++ or scan
    :return: labels, number of clustering iterations needed, smallest set of cluster found, execution time
    """
    start = timer()
    regularized_laplacian = regularized_laplacian_matrix(adj_matrix, tau)
    eigen_values, eigen_vectors = eigen_solver(regularized_laplacian, n_clusters=n_clusters)
    if algo == 'kmeans++':
        _, labels, _, num_iterations = k_means(eigen_vectors, n_clusters=n_clusters, return_n_iter=True)
    else:
        if n_clusters == 2:  # cluster based on sign
            second_eigen_vector_index = np.argsort(eigen_values)[1]
            second_eigen_vector = eigen_vectors.T[second_eigen_vector_index]
            labels = [0 if val <= 0 else 1 for val in second_eigen_vector]  # use only the second eigenvector
            num_iterations = 1
        else:  # bisecting it into k-ways, use all eigenvectors
            labels = discretize(eigen_vectors)
            num_iterations = 20  # assume the worst case scenario that it took 20 restarts
    end = timer()
    execution_time = end - start
    smallest_cluster_size = min(np.sum(labels), abs(np.sum(labels) - len(labels)))
    return labels, num_iterations, smallest_cluster_size, execution_time
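# Usage sketch (added): a toy two-community adjacency matrix. This assumes the helpers
# regularized_laplacian_matrix, eigen_solver and timer referenced above exist in the same
# module; the graph, tau and algo values are made-up examples, not taken from the
# original code.
import numpy as np
community = np.ones((10, 10)) - np.eye(10)
adj_demo = np.zeros((20, 20))
adj_demo[:10, :10] = community
adj_demo[10:, 10:] = community
adj_demo[0, 10] = adj_demo[10, 0] = 1.0  # a single weak edge linking the two communities
labels_demo, n_iter_demo, smallest_demo, elapsed_demo = regularized_spectral_clustering(
    adj_demo, tau=1.0, n_clusters=2, algo='scan')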
def test_k_means_function():
    # test calling the k_means function directly
    # catch output
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        cluster_centers, labels, inertia = k_means(X, n_clusters=n_clusters,
                                                   sample_weight=None,
                                                   verbose=True)
    finally:
        sys.stdout = old_stdout
    centers = cluster_centers
    assert centers.shape == (n_clusters, n_features)

    assert np.unique(labels).shape[0] == n_clusters

    # check that the label assignment is perfect (up to a permutation)
    assert v_measure_score(true_labels, labels) == 1.0
    assert inertia > 0.0

    # check warning when centers are passed
    assert_warns(RuntimeWarning, k_means, X, n_clusters=n_clusters,
                 sample_weight=None, init=centers)
def spectral_clustering(road_map=None, a=None, use_ncut=False, num_clusters=2):
    print("Building adjacency matrix")
    if(a==None):
        a = build_adjacency_matrix(road_map)

    print("Computing laplacian")
    l = build_laplacian(a, normalize=use_ncut)

    print("Spectral embedding")
    #e_vals, e_vects = eigsh(l, k=num_clusters, which='SM', tol=0.01, sigma=2.01)
    X = np.random.rand(l.shape[0], num_clusters+1)
    e_vals, e_vects = lobpcg(l, X, tol=1e-15, largest=False, maxiter=2000)
    embedded_data = e_vects[:,1:]
    print e_vals

    print("Clustering")
    centroid, label, inertia = k_means(embedded_data, num_clusters)

    for i in xrange(len(label)):
        road_map.nodes[i].region_id = label[i]
def clusters(n = 100): from sklearn.cluster import k_means import numpy as np import matplotlib.pyplot as plt from mpl_toolkits.basemap import Basemap from matplotlib.patches import Polygon from matplotlib.collections import PatchCollection from matplotlib.patches import PathPatch from mapFuncts import USbasemap from matplotlib import cm, colors, colorbar fig = plt.figure() ax = fig.add_subplot(111) fig, ax, m = USbasemap(fig = fig, ax = ax) alldivDF, _, _, _, _ = seasonal_setup(season = 'MAM') centroid, label, inertia = k_means(alldivDF.T, n_clusters = n, init = 'k-means++') names = alldivDF.columns codes = reverseStates(importStates()) cmap = cm.Accent norm = colors.Normalize(vmin = 0, vmax = n, clip = True) mapper = cm.ScalarMappable(norm = norm, cmap = cmap) for cluster in range(n): idx = label==cluster patches = [] color = mapper.to_rgba(cluster) for name in names[idx]: divcode = codes[name[:-3]]+name[-2:] print name, divcode for info, shape in zip(m.divs_info, m.divs): if info['CLIMDIV']==int(divcode): patches.append(Polygon(np.array(shape),True)) ax.add_collection(PatchCollection(patches, facecolor = color, edgecolor='k', linewidths=1., zorder=2)) return fig, ax
def fit( self, data: List[Iterator[float]], find_n: bool = False ) -> Dict[str, Union[List[int], Union[float, None]]]: """Cluster the input data into n clusters. Args: data: A list of vectors. find_n: If True, don't use self.n_cluster but find n using elbow analysis instead Return: A list of integers as class labels. The order of the list corresponds to the order of the input data. """ if find_n: self.n_clusters = 5 # self._get_n() if self.clus_type == 'kmeans': self.cluster = k_means(n_clusters=self.n_clusters) elif self.clus_type == 'sphericalkmeans': self.cluster = SphericalKMeans(n_clusters=self.n_clusters) elif self.clus_type == 'agglomerative': self.cluster = AgglomerativeClustering(n_clusters=self.n_clusters, affinity=self.affinity, linkage=self.linkage) self.cluster.fit(data) self._calc_density() return {'labels': self.cluster.labels_, 'density': self.compactness}
def sweep_clusters(args):
    data = joblib.load(args.model_file)
    projected = data[PROJECTION_KEY]
    print "Model type", data[MODEL_TYPE_KEY]

    if not os.path.exists(args.figures_dir):
        os.makedirs(args.figures_dir)

    inertia_values = []
    for k in args.n_clusters:
        print "Clustering with %s states" % k
        _, _, inertia = k_means(projected[:, args.dimensions], k, n_jobs=-2)
        inertia_values.append(inertia)

    plt.plot(args.n_clusters, inertia_values, "k.-")
    plt.xlabel("Number of Clusters", fontsize=16)
    plt.ylabel("Inertia", fontsize=16)
    fig_flname = os.path.join(args.figures_dir, "cluster_inertia")
    for dim in args.dimensions:
        fig_flname += "_%s" % dim
    fig_flname += ".png"
    plt.savefig(fig_flname, dpi=300)
def spectral_clustering(affinity, n_clusters=8, n_components=None, eigen_solver=None, random_state=None, n_init=10, eigen_tol=0.0, assign_labels='kmeans'): if assign_labels not in ('kmeans', 'discretize', 'AgglomerativeClustering'): raise ValueError("The 'assign_labels' parameter should be " "'kmeans' or 'discretize', but '%s' was given" % assign_labels) random_state = check_random_state(random_state) n_components = n_clusters if n_components is None else n_components maps = spectral_embedding(affinity, n_components=n_components, eigen_solver=eigen_solver, random_state=random_state, eigen_tol=eigen_tol, drop_first=False) if assign_labels == 'kmeans': _, labels, _ = k_means(maps, n_clusters) else: labels = discretize(maps, random_state=random_state) return labels
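# Usage sketch (added): feed the local spectral_clustering above a dense RBF affinity
# matrix built from two blobs, letting k_means assign the final labels. Assumes
# spectral_embedding, discretize, check_random_state and k_means are imported from
# scikit-learn as the function requires; the data and gamma value are illustrative only.
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.metrics.pairwise import rbf_kernel
X_demo, _ = make_blobs(n_samples=100, centers=2, random_state=0)
affinity_demo = rbf_kernel(X_demo, gamma=1.0)
labels_demo = spectral_clustering(affinity_demo, n_clusters=2, assign_labels='kmeans')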
def clust(X,vectorfile,clust_file,clust_number): fid2fname = {} for line in open(vectorfile) : line = line.strip().split('\t') fid2fname.setdefault(int(line[0]), line[1:]) # Force the solver to be arpack, since amg is numerically # unstable on this example # labels = spectral_clustering(graph, n_clusters=160, eigen_solver='arpack') a, labels,inertia = cluster.k_means(X,n_clusters=clust_number) print ("inertia=",inertia) C=np.column_stack((X,labels)) easy_data=open(clust_file, 'w') for pnt1 in C : strx="" for charest in pnt1: strx+=str(int(charest))+"\t" print >> easy_data, strx easy_data.close() return inertia
def kmeans(data, depth): original = data data = data[data.PRES < depth] if data[data.DATA_MODE != "D"].shape[0] > 0 and data[data.DATA_MODE == "D"].shape[0] > 0: # Get mid-range of DMQC and RTQC data rm_d = (data[data.DATA_MODE == "D"].PSAL.max() + data[data.DATA_MODE == "D"].PSAL.min()) / 2 rm_r = (data[data.DATA_MODE != "D"].PSAL.max() + data[data.DATA_MODE != "D"].PSAL.min()) / 2 # K means algorithm init = array([[rm_d], [rm_r]]) centroid, labels, loss = k_means(data[["PSAL"]], 2, init=init, n_init=1) else: return (False, original, [], []) data.insert(data.shape[1], "LABEL", labels) # Get data from the second group gruped = data[data.LABEL == 1].groupby(["PLATFORM_NUMBER", "CYCLE_NUMBER" ]).size().reset_index() platform_numbers = gruped.PLATFORM_NUMBER.unique().tolist() temp_data = original[ (original.PLATFORM_NUMBER.isin(gruped.PLATFORM_NUMBER)) & (original.CYCLE_NUMBER.isin(gruped.CYCLE_NUMBER))] # If the second group has DMQC data, the flag is False flag = False if temp_data[temp_data.DATA_MODE == "D"].shape[0] > 0 else True # Get data from the first group temp_data = original[~( (original.PLATFORM_NUMBER.isin(gruped.PLATFORM_NUMBER)) & (original.CYCLE_NUMBER.isin(gruped.CYCLE_NUMBER)))] return (flag, original, temp_data, platform_numbers)
def kmeans_cluster(self, data, n_clusters, maximum): """ Calculate K-means and plot """ # Make clusters from sklearn.cluster import k_means import numpy as np init = np.array([[1.25, 0.28], [4.29, 0.71], [0.99, 1.15], [6.5, 1.011]]) centroids, labels, sse = k_means(data, n_clusters=n_clusters, init=init, n_init=100) # Plot from matplotlib import pyplot as plt plt.figure(figsize=(10, 7)) # Define color map #cmap = 'jet' cmap = self.color_map('kmeans') # De-normalize before plot plt.scatter(data[:, 0] * maximum, data[:, 1], c=labels, cmap=cmap) plt.title('Clusters - k-means') plt.xlabel('var0') plt.ylabel('var1') plt.savefig('clusters_kmeans.png') plt.close() return ()
def draw_kmeans(item):
    print 'dataset:', items[item]
    train_data = tools.data_pre(item)
    [centroid, label, inertia] = cluster.k_means(train_data, cluster_k)
    # save data into the csv files
    root_path = root_dir + 'data\kmeans' + os.sep + items[item] + os.sep
    print 'result path:', root_path
    if not os.path.isdir(root_path):
        os.makedirs(root_path)
        time.sleep(2)
    label_path = root_path + items[item] + '_label.csv'
    count_path = root_path + items[item] + '_count.csv'
    cent_path = root_path + items[item] + '_cent.csv'
    np.savetxt(label_path, label, delimiter=',')
    pd.value_counts(label).T.to_csv(count_path)  # value_counts() builds a histogram of non-null value counts
    df = pd.DataFrame(centroid)  # T transposes; to_csv() writes the data to a csv file
    df.to_csv(cent_path, float_format='%.5f')
    index = 1
    for i in centroid:
        plt.figure(index)
        title_name = items[item] + '_kmeans_' + str(index)  # figure title
        plt.title(title_name)
        pd.Series(i).plot()  # plot the centroid profile
        path = root_dir + '\data\kmeans\img' + items[item] + '_kmeans_' + str(
            index) + '.png'
        plt.savefig(path)  # save the figure
        index += 1
def sklearn_spectral_clustering(adj_matrix, n_clusters): """ :param adj_matrix: adjacency matrix representation of graph where [m][n] >0 if there is edge and [m][n] = weight :param n_clusters: cluster partitioning constant :return: labels, number of clustering iterations needed, smallest set of cluster found, execution time """ start = timer() connectivity = kneighbors_graph(adj_matrix, n_neighbors=10, include_self=True) affinity_matrix_ = 0.5 * (connectivity + connectivity.T) eigen_vectors = spectral_embedding(affinity_matrix_, n_components=n_clusters, eigen_solver='arpack', eigen_tol=0.0, norm_laplacian=True, drop_first=False) _, labels, _, num_iterations = k_means(eigen_vectors, n_clusters=n_clusters, return_n_iter=True) end = timer() execution_time = end - start smallest_cluster_size = min(np.sum(labels), abs(np.sum(labels) - labels.size)) return labels, num_iterations, smallest_cluster_size, execution_time
def fun(train, test): granular_balls = GBList(train, train) # Build granular balls granular_balls.init_granular_balls() # Initialize granular balls, divide granular balls according to purity ball_list = granular_balls.granular_balls Ball_list = funtion(ball_list) # Continue to divide the granular balls with overlapping boundaries while True: init_center = [] Ball_num1 = len(Ball_list) # Count the number of granular balls for i in range(len(Ball_list)): init_center.append(Ball_list[i].center) ClusterLists = k_means(X=train[:, 1:-1], init=np.array(init_center), n_clusters=len(Ball_list)) data_label = ClusterLists[1] ball_list = [] for i in set(data_label): Cluster_data = train[data_label == i, :] ball_list.append(GranularBall(Cluster_data)) Ball_list = funtion(ball_list) Ball_num2 = len(Ball_list) # Number of granular balls after statistical division if Ball_num1 == Ball_num2: # Stop if the number of granular balls no longer changes break # plot_gb(Ball_list) # Visualize two-dimensional granular balls (using data.csv data set) ball_num = len(Ball_list) # Number of granular balls generated time1 = time.time() count_num = nearest_knn(Ball_list, test) # Test nearest neighbor accuracy time2 = time.time() test_time = time2 - time1 # Statistical classification time return count_num, ball_num, test_time
def testGeneratedData():
    X, y = make_blobs(n_samples=1000, centers=np.array([[-2.5, 0], [2.5, 0], [0, 2.5], [0, -2.5]]))
    n_clusters = np.unique(y).shape[0]
    n_instances = X.shape[0]

    # Delete some seeds:
    yMod = np.copy(y)
    for i in range(n_instances):
        if np.random.rand() > 0.5:
            yMod[i] = -1  # Means no class

    sm = ConstrainedKMeans(n_clusters=n_clusters, max_iter=2000, verbose=1)
    sm.fit(X, yMod)
    predictedLabels = sm.predict(X)
    trueLabels = y
    adjustedRandScore = metrics.adjusted_rand_score(trueLabels, predictedLabels)
    print('Constrained KMeans adjusted rand score: %s' % (adjustedRandScore))

    (centers, predictedLabels, inertia, best_n_iter) = k_means(X, n_clusters, n_init=1, return_n_iter=True)
    print('Number of iterations: %s' % (best_n_iter))
    # Find the correct predictions, the permutation that maximizes the accuracy:
    # predictedLabels = kmeans.labels_
    trueLabels = y
    adjustedRandScore = metrics.adjusted_rand_score(trueLabels, predictedLabels)
    print('KMeans adjusted rand score: %s' % (adjustedRandScore))

    tColour = tuple([0, 1, 0])
    plt.scatter(X[y == predictedLabels, 0], X[y == predictedLabels, 1], c=tColour, alpha=0.5)
    tColour = tuple([1, 0, 0])
    plt.scatter(X[y != predictedLabels, 0], X[y != predictedLabels, 1], c=tColour, alpha=0.5)
    plt.show()
def KMeansCluster(matrix): """ Performs the K-Means cluster given a matrix of data @param[in]: matrix, List of List(s) """ # Possibly need to scale the data first data = scale(matrix) # Approximate the number of clusters using c = root(n/2) # num_clusters = int(sqrt(len(matrix) / 2)) num_clusters = 5 number_init = 10 # Default number_iter = 300 num_cpus = 2 print "===================" print "Training KMeans with (num_clusters, num_init, num_iters, num_cpus)" print num_clusters, number_init, number_iter, num_cpus # estimator = KMeans(init='k-means++', n_clusters = num_clusters, n_init = number_init) # estimator.fit(data) # clusters = k_means(data, n_clusters = num_clusters, max_iter=number_iter, n_init = number_iter, # init='k-means++', n_jobs = num_cpus) clusters = k_means(data, n_clusters = num_clusters, max_iter=number_iter, n_init = number_iter, n_jobs = num_cpus) return clusters
def kmeans_estimate(x): labels = k_means(x, 2)[1] # print labels cluster = [[], []] for i in xrange(len(labels)): cluster[labels[i]].append(x[i]) cluster[0] = np.array(cluster[0]) cluster[1] = np.array(cluster[1]) pi = len(cluster[0]) * 1.0 / len(labels) pis = [pi, 1 - pi] means = [] sigmas = [] for i in xrange(2): curr = cluster[i] mean = np.average(curr, axis=0) means.append(mean) sigma = np.zeros((2, 2)) for i in xrange(len(curr)): y = np.reshape(curr[i] - mean, (2, 1)) sigma += np.dot(y, y.T) sigma /= len(curr) sigmas.append(sigma) pis = np.array(pis) means = np.array(means) sigmas = np.array(sigmas) print means return pis, means, sigmas
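# Illustrative usage (added): the k-means labels seed the mixing weights, means and
# covariances of a two-component Gaussian mixture, e.g. as a starting point for EM.
# Assumes a Python 2 environment like the original snippet (xrange, print statement);
# the two synthetic 2-D Gaussians below are made up for the example.
import numpy as np
rng = np.random.RandomState(0)
x_demo = np.vstack([rng.multivariate_normal([0.0, 0.0], np.eye(2), 150),
                    rng.multivariate_normal([4.0, 4.0], np.eye(2), 150)])
pis_demo, means_demo, sigmas_demo = kmeans_estimate(x_demo)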
def clusterValidity(X, y): # Maximum number of clusters: K = 10 # Allocate variables: Rand = np.zeros((K, )) Jaccard = np.zeros((K, )) NMI = np.zeros((K, )) for k in range(K): # run K-means clustering: #cls = Pycluster.kcluster(X,k+1)[0] centroids, cls, inertia = k_means(X, k + 1) # compute cluster validities: Rand[k], Jaccard[k], NMI[k] = clusterval(y, cls) # Plot results: figure(1) title('Cluster validity') plot(np.arange(K) + 1, Rand) plot(np.arange(K) + 1, Jaccard) plot(np.arange(K) + 1, NMI) ylim(-2, 1.1) legend(['Rand', 'Jaccard', 'NMI'], loc=4) show()
def main():
    import matplotlib.pyplot as plt
    from sklearn.datasets.samples_generator import make_blobs

    n_centers = 3
    X, y = make_blobs(n_samples=1000, centers=n_centers, n_features=2,
                      cluster_std=0.7, random_state=0)

    # Run this K-Means
    import kmeans
    t0 = time.time()
    y_pred, centers, obj_val_seq = kmeans.kmeans(X, n_centers)
    t1 = time.time()
    print("Final obj val: {}".format(obj_val_seq[-1]))
    print("Time taken (this implementation): {}".format(t1 - t0))

    # Run scikit-learn's K-Means
    from sklearn.cluster import k_means
    t0 = time.time()
    centers, y_pred, obj_val = k_means(X, n_centers, random_state=0)
    t1 = time.time()
    print("Final obj val: {}".format(obj_val))
    print("Time taken (Scikit, 1 job): {}".format(t1 - t0))

    # Plot change in objective value over iteration
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(obj_val_seq, 'b-', marker='*')
    fig.suptitle("Change in K-means objective value across iterations")
    ax.set_xlabel("Iteration")
    ax.set_ylabel("Objective value")
    fig.show()

    # Plot data
    from itertools import cycle
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    fig = plt.figure(figsize=plt.figaspect(0.5))  # Make twice as wide to accommodate both plots
    ax = fig.add_subplot(121)
    ax.set_title("Data with true labels and final centers")
    for k, color in zip(range(n_centers), colors):
        ax.plot(X[y==k, 0], X[y==k, 1], color + '.')

    initial_centers = kmeans.init_centers(X, n_centers, 2)  # This is valid because we always use the same random seed.

    # Plot initial centers
    for x in initial_centers:
        ax.plot(x[0], x[1], "mo", markeredgecolor="k", markersize=8)

    # Plot final centers
    for x in centers:
        ax.plot(x[0], x[1], "co", markeredgecolor="k", markersize=8)

    # Plot assignments
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    ax = fig.add_subplot(122)
    ax.set_title("Data with final assignments")
    for k, color in zip(range(n_centers), colors):
        ax.plot(X[y_pred==k, 0], X[y_pred==k, 1], color + '.')

    fig.tight_layout()
    fig.gca()
    fig.show()
def DrawSsePlot(self, title, data, random_state):
    chart_folder = self.dir_folder + title + ".png"
    chart_folder_re = self.upload_folder + title + ".png"
    sse = []  # SSE values for the elbow method
    lunkuo = []  # silhouette coefficients
    start, end = 3, 15
    for i in range(start, end):
        km = k_means(data, n_clusters=i, random_state=random_state)
        sse.append(km[2])
        lunkuo.append(silhouette_score(data, km[1], metric='euclidean'))
    fig, ax1 = plt.subplots(figsize=(10, 7))
    ax2 = ax1.twinx()
    lns1 = ax1.plot(range(start, end), sse, 'o-', c='g', label='zhou-bu')
    lns2 = ax2.plot(range(start, end), lunkuo, 'o-', c='r', label='lun-kuo')
    new_ticks = np.linspace(start, end, end - start + 1)
    plt.xticks(new_ticks)
    lns = lns1 + lns2
    labs = [l.get_label() for l in lns]
    ax1.legend(lns, labs, loc=0)
    ax1.set_xlabel('K')
    ax1.set_ylabel('SSE')
    ax2.set_ylabel('LUN-KUO-INDEX')
    plt.savefig(chart_folder)
    plt.savefig(chart_folder_re)
    plt.cla()
    plt.clf()
    return chart_folder
def test_k_means_function():
    # test calling the k_means function directly
    # catch output
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        cluster_centers, labels, inertia = k_means(X, n_clusters=n_clusters,
                                                   sample_weight=None,
                                                   verbose=True)
    finally:
        sys.stdout = old_stdout
    centers = cluster_centers
    assert_equal(centers.shape, (n_clusters, n_features))
    labels = labels
    assert_equal(np.unique(labels).shape[0], n_clusters)

    # check that the label assignment is perfect (up to a permutation)
    assert_equal(v_measure_score(true_labels, labels), 1.0)
    assert_greater(inertia, 0.0)

    # check warning when centers are passed
    assert_warns(RuntimeWarning, k_means, X, n_clusters=n_clusters,
                 sample_weight=None, init=centers)

    # too many clusters desired
    assert_raises(ValueError, k_means, X, n_clusters=X.shape[0] + 1,
                  sample_weight=None)

    # kmeans for algorithm='elkan' raises TypeError on sparse matrix
    assert_raise_message(TypeError, "algorithm='elkan' not supported for "
                         "sparse input X", k_means, X=X_csr, n_clusters=2,
                         sample_weight=None, algorithm="elkan")
def load_semantic_map(): target_mask_img = img_to_array( load_img(target_mask_path, target_size=(img_nrows, img_ncols))) style_mask_img = img_to_array( load_img(style_mask_path, target_size=(img_nrows, img_ncols))) if K.image_dim_ordering() == 'th': mask_vecs = np.vstack([ style_mask_img.reshape((3, -1)).T, target_mask_img.reshape((3, -1)).T ]) else: mask_vecs = np.vstack([ style_mask_img.reshape((-1, 3)), target_mask_img.reshape((-1, 3)) ]) #_, labels = vq.kmeans2(mask_vecs, nb_labels, missing='raise') _, labels, _ = k_means(mask_vecs.astype("float64"), nb_labels) style_mask_label = labels[:img_nrows * img_ncols].reshape( (img_nrows, img_ncols)) target_mask_label = labels[img_nrows * img_ncols:].reshape( (img_nrows, img_ncols)) style_mask = np.stack([style_mask_label == r for r in range(nb_labels)], axis=0) target_mask = np.stack([target_mask_label == r for r in range(nb_labels)], axis=0) return np.expand_dims(style_mask, axis=0), np.expand_dims(target_mask, axis=0)
def generate_cluster(self): for i in range(self.size): cod = self.get_coordinate() self.G.add_node(i, pos=cod) for i in range(len(self.ls)): self.dic[i] = [] for j in range(len(self.ls)): val = self.get_dist(self.ls[i], self.ls[j]) self.dic[i].append(round(val)) self.cluster = DBSCAN(eps=4, min_samples=1).fit_predict(np.array(self.ls)) n_clus = max(self.cluster) + 1 temp = k_means(np.array(self.ls), n_clusters=n_clus) self.centroid = temp[0] self.cluster = temp[1] for i in range(len(self.centroid)): for j in range(2): self.centroid[i][j] = int(round(self.centroid[i][j])) for i in range(max(self.cluster) + 1): self.final[i] = [] for j in range(len(self.cluster)): if (i == self.cluster[j]): self.final[i].append(j) for i in range(len(self.final)): self.cluster_head.append(self.final[i][self.getClusterHead(self.final[i], self.centroid[i])]) return self.G
def getNPointsInRegion(self, NPoints, n=89, s=-60, e=180, w=-180, fidelity=1): ''' :param NPoints: Number of coordinates you want to find :param n: Northern Latitude :param s: Southern Latitude :param e: Eastern Longitude :param w: Western Longitude :param fidelity: granularity of the coordinate grid returned (smaller means more points) :return: Pandas DataFrame of the points found to be land, Pandas DataFrame the equidistant coordinates within this region ''' globe_m = self.getPointsBetween(n, s, e, w, fidelity) land = np.squeeze(np.asarray(globe_m[:, 2] == 1)) land_m = globe_m[land] myCL = cluster.k_means(land_m, NPoints) centroids = myCL[0] return pd.DataFrame(land_m).iloc[:, 0:2], pd.DataFrame( centroids).iloc[:, 0:2]
def test_k_means_function():
    # test calling the k_means function directly
    # catch output
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        cluster_centers, labels, inertia = k_means(X, n_clusters=n_clusters,
                                                   verbose=True)
    finally:
        sys.stdout = old_stdout
    centers = cluster_centers
    assert_equal(centers.shape, (n_clusters, n_features))
    labels = labels
    assert_equal(np.unique(labels).shape[0], n_clusters)

    # check that the label assignment is perfect (up to a permutation)
    assert_equal(v_measure_score(true_labels, labels), 1.0)
    assert_greater(inertia, 0.0)

    # check warning when centers are passed
    assert_warns(RuntimeWarning, k_means, X, n_clusters=n_clusters,
                 init=centers)

    # too many clusters desired
    assert_raises(ValueError, k_means, X, n_clusters=X.shape[0] + 1)
def smythEmissionDistribution(pair):
    """
    Given a pair (S: list of sequences, target_m: int), get the emission
    distribution for Smyth's "default" HMM. target_m is an upper bound on the
    number of states -- if we can only have m' distinct observation values,
    then the distribution for an m' state HMM is returned.

    @param pair: A tuple of the form (S: list of sequences, target_m: int)
    @return: The corresponding emission distribution encoded as a list of
             (mu, stddev) pairs
    """
    S, target_m = pair
    merged, distinct = prepareSeqs(S)
    m_prime = min(target_m, len(distinct))
    centroids, labels, inertia = k_means(merged, m_prime, init='k-means++')
    clusters = partition(merged, labels)
    B = []
    has_zero = False
    for cluster in clusters:
        assert len(cluster) > 0
        mu = mean(cluster)
        stddev = std(cluster)
        B.append((mu, stddev))
        if stddev < 0.001:
            has_zero = True
    return (B, labels, has_zero)
def get_jackknife_randoms(N_jack, catalog_data, generate_randoms, ra='ra', dec='dec'): """ Computes the jackknife regions and random catalogs for each region Parameters ---------- N_jack : number of regions catalog_data : input catalog generate_randoms: function to generate randoms (eg self.generate_processed_randoms) Returns ------- jack_labels: array of regions in catalog data randoms: dict of randoms labeled by region """ #cluster nn = np.stack((catalog_data[ra], catalog_data[dec]), axis=1) _, jack_labels, _ = k_means(n_clusters=N_jack, random_state=0, X=nn) randoms = {} for nj in range(N_jack): catalog_data_jk = dict( zip(catalog_data.keys(), [v[(jack_labels != nj)] for v in catalog_data.values()])) rand_cat, rr = generate_randoms( catalog_data_jk) #get randoms for this footprint randoms[str(nj)] = {'ran': rand_cat, 'rr': rr} return jack_labels, randoms
def mosquito_init(sample_space, dim_count, num_particles, num_swarms=5): if num_swarms > num_particles: num_swarms = num_particles low_bound, high_bound = sample_space mosquitos = np.random.uniform(low_bound, high_bound, size=(num_particles, dim_count)) #leaders, swarms = kmeans(mosquitos, num_centroids=num_swarms) #print swarms #print res = k_means(mosquitos, num_swarms) swarm_idx = res[1] swarms = [[] for _ in xrange(num_swarms)] for idx in xrange(len(swarm_idx)): swarms[swarm_idx[idx]].append(mosquitos[idx]) # print swarms # exit(0) starvation = [0.0 for _ in xrange(num_swarms)] return [(np.inf, None) for _ in xrange(num_swarms)], swarms, starvation, num_particles
def determine_k(item):
    print ''
    nums = 10  # number of runs over which the silhouette coefficient is averaged
    max_k = 10
    min_k = 2
    result = [0] * (max_k - min_k)
    data = data_pre(item)
    for j in range(nums):
        scs = []
        for i in range(min_k, max_k):
            [centroid, label, inertia] = cluster.k_means(data, i)
            sc = silhouette_score(data, label, metric='euclidean')  # mean silhouette coefficient
            scs.append(sc)
            result[i - min_k] += sc
    for i in range(len(scs)):
        result[i] = result[i] / nums
    temp = pd.DataFrame(result)
    temp.to_csv('data/Silhouette' + str(item) + '.csv', index=False)  # index controls whether row numbers are written
    plt.plot(result, 'rx-')
    plt.title('scs' + str(nums))
    plt.xlabel('the numbers of clusters')
    plt.ylabel('Silhouette Coefficient')
    plt.savefig('img/Silhouette Coefficient.png')
    #plt.show()
    plt.close()
def example_4(): """ compare to scikitlearn implementation of kmeans """ import sklearn.cluster as skc import time ndata = 50000 dimension = 10 ncentroids = 1000 data = npr.randn(ndata, dimension).astype(np.float64) centroids0 = data[0:ncentroids, :] t0 = time.time() kmeans.get_clustering(X = data, init = centroids0, n_clusters = ncentroids, algorithm = 'auto', verbose = 1, n_threads = 1) t1 = time.time() sklearner = skc.k_means(X = data, n_clusters = ncentroids, max_iter = 1000, n_init = 1, init = centroids0, precompute_distances = False, verbose = True, n_jobs = 1, return_n_iter = True, tol = 0.0) t2 = time.time() kmeans_time = t1 - t0 sklearner_time = t2 - t1 print "sklearn : ", sklearner_time, " s" print "this kmeans: ", kmeans_time, " s"
def cluster_samples(args): workdir = args.workdir analysis_dir = os.path.join(workdir, "analysis") if not os.path.exists(analysis_dir): os.makedirs(analysis_dir) project_summary = deserialize(os.path.join(workdir, PROJECT_SUMMARY_FLNAME)) model_fl = os.path.join(workdir, "models", "pca.pkl") model = joblib.load(model_fl) projected = model[PROJECTION_KEY] components = map(lambda idx: idx - 1, args.components) selected = projected[:, components] features = read_features(workdir) _, labels, inertia = k_means(selected, args.n_clusters, n_jobs=-2) fig_flname = os.path.join(analysis_dir, "clusters_%s.tsv" % args.n_clusters) clusters = defaultdict(list) for name, cluster in zip(features.sample_labels, labels): clusters[cluster].append(name) with open(fig_flname, "w") as fl: for cluster, samples in clusters.iteritems(): fl.write(str(cluster)) fl.write(",") fl.write(",".join(samples)) fl.write("\n")
def sweep_clusters(args):
    workdir = args.workdir

    figures_dir = os.path.join(workdir, "figures")
    if not os.path.exists(figures_dir):
        os.makedirs(figures_dir)

    project_summary = deserialize(os.path.join(workdir, PROJECT_SUMMARY_FLNAME))
    model_fl = os.path.join(workdir, "models", "pca.pkl")
    model = joblib.load(model_fl)
    projected = model[PROJECTION_KEY]

    components = map(lambda idx: idx - 1, args.components)
    selected = projected[:, components]

    features = read_features(workdir)

    inertia_values = []
    for k in args.n_clusters:
        print "Clustering with %s clusters" % k
        _, _, inertia = k_means(selected, k, n_jobs=-2)
        inertia_values.append(inertia)

    plt.plot(args.n_clusters, inertia_values, "k.-")
    plt.xlabel("Number of Clusters", fontsize=16)
    plt.ylabel("Inertia", fontsize=16)
    fig_flname = os.path.join(figures_dir, "cluster_inertia")
    for dim in args.components:
        fig_flname += "_%s" % dim
    fig_flname += ".png"
    plt.savefig(fig_flname, dpi=300)
def cluster_GSOM(self, gsom_map, n_clusters=2): """ Parameters ---------- gsom_map : growing self organizing map 2D array of weight vectors in SOM. n_clusters : number of clusters. Returns ------- gsom_list : list list of the gsom nodes centroid : list cluster centroids. labels : list cluster label w.r.t. gsom node data-point as in gsom_list """ gsom_list = self._gsom_to_array(gsom_map) clf = k_means(gsom_list, n_clusters=n_clusters) centroids = clf[0] labels = clf[1] return gsom_list, centroids, labels
def action_execute_button_clicked(self):
    # open the input image
    input_img = gdal.Open(self.input_file_path.text())
    img_rows = input_img.RasterYSize
    img_cols = input_img.RasterXSize
    img_bands = input_img.RasterCount
    img_geotrans = input_img.GetGeoTransform()
    img_proj = input_img.GetProjection()
    # reshape the image into the sample matrix accepted by the k_means function
    input_features = []
    for i in range(1, img_bands + 1):
        band_img = input_img.GetRasterBand(i).ReadAsArray(
            0, 0, img_cols, img_rows)
        input_features.append(band_img.reshape(-1))
    input_features = np.array(input_features).T
    # run the k_means algorithm
    kmeans_result = k_means(input_features,
                            int(self.cluster_num.currentText()),
                            max_iter=int(self.iter_num.currentText()))
    # map each sample point's value to the colour assigned to its cluster centre
    cluster_centers, clustered_points, _ = kmeans_result
    output_feature = []
    for index, item in enumerate(clustered_points):
        while item > len(self.color_list) - 1:
            self.color_list.append(list(np.random.randint(256, size=3)))
        output_feature.append(self.color_list[item])
    output_feature = np.array(output_feature).T
    output_feature = np.array(
        list(map(lambda x: x.reshape((img_rows, img_cols)), output_feature)))
    # write the clustered image
    driver = gdal.GetDriverByName("GTiff")
    output_img = driver.Create(self.output_file_path.text(), img_cols, img_rows,
                               3, gdal.GDT_Byte)
    output_img.SetGeoTransform(img_geotrans)
    output_img.SetProjection(img_proj)
    for i in range(1, 4):
        output_img.GetRasterBand(i).WriteArray(output_feature[i - 1])
    del output_img
    layer_legends = []  # legend entries
    for i in range(len(cluster_centers)):
        layer_legends.append({
            'name': 'Cluster' + str(i + 1),
            'color': QColor(self.color_list[i][0], self.color_list[i][1],
                            self.color_list[i][2])
        })
    if (QMessageBox.question(self, "Message",
                             "Clustering finished. Add the result to the layers?",
                             QMessageBox.Yes | QMessageBox.No,
                             QMessageBox.Yes) == QMessageBox.Yes):
        self.add_layer_signal.emit(self.output_file_path.text(), layer_legends)
def testRealData(): sDirname = os.path.dirname(os.path.abspath(__file__)) dfAspen = pd.read_csv( os.path.join(sDirname, '..', 'datasets', 'aspen.csv'), ';') dfAspen = dfAspen.dropna() dfAspen = dfAspen.reindex(np.random.permutation(dfAspen.index)) asCities = dfAspen['address_city'].unique() X = dfAspen[['location_coordinates_0', 'location_coordinates_1']].as_matrix() Y = dfAspen['address_city'].as_matrix() Y = np.array([asCities.tolist().index(sCity) for sCity in Y ]) # Convert array of strings to sequential integers # Convert coordinates to x,y grid using the Equirectangular projection X = X * np.pi / 180 fMeanLatitude = X.mean(0)[0] X[:, 0] = X[:, 0] * np.cos(fMeanLatitude) # First x, then y: Xaux = np.copy(X) X[:, 1] = Xaux[:, 0] X[:, 0] = Xaux[:, 1] n_clusters = len(asCities) # K-means: (centers, PredictedLabels, inertia, best_n_iter) = k_means(X, n_clusters, n_init=1, return_n_iter=True) adjustedRandScore = metrics.adjusted_rand_score(Y, PredictedLabels) print('KMeans adjusted rand score: %s' % (adjustedRandScore)) print('Number of iterations: %s' % (best_n_iter)) # Seeded k-means with all seeds: # Drop some seeds: fRatio = 0 dMapping = {} maxSeed = -1 SomeSeeds = np.repeat(-1, len(Y)) for i in range(len(Y)): if np.random.rand() > fRatio: continue iCity = Y[i] if iCity not in dMapping: maxSeed += 1 dMapping[iCity] = maxSeed SomeSeeds[i] = dMapping[iCity] sm = SeededKMeans(n_clusters=n_clusters, max_iter=2000, verbose=1) sm.fit(X, SomeSeeds) PredictedLabels = sm.predict(X) adjustedRandScore = metrics.adjusted_rand_score(Y, PredictedLabels) print('Seeded KMeans adjusted rand score: %s' % (adjustedRandScore)) for predictedLabel in np.unique(PredictedLabels): plt.scatter(X[PredictedLabels == predictedLabel, 0], X[PredictedLabels == predictedLabel, 1], color=(np.random.rand(), np.random.rand(), np.random.rand()), alpha=1) plt.show()
def get_point_centroids(indata,K,D): mean = numpy.zeros((indata.shape[1],D)) for n in xrange(0,(indata.shape[1])): for i in xrange(0,(indata.shape[2])): for j in xrange(0,D): mean[n][j] = mean[n][j] + indata[j][n][i] mean[n] = mean[n]/(indata.shape[2]) (centroids,x,y)=k_means(mean,K) #random order. change n_jobs to speed up return centroids
def twoClassTrain(self,data1,data2): #y is a N * 1 matrix data = data1 + data2 y = [[1 if d[0] == data1[0][0] else -1] for d in data] X = [d[1:] for d in data] centroid, label, inertia = cluster.k_means(X,self.k) phi = self.genTransferMatrix(X,centroid) w = numpy.dot(numpy.linalg.pinv(phi),y) return w, centroid
def learn(self, Xtrain, ytrain): """ Learns using the traindata """ Xless = Xtrain[:,self.features] lam = 100 self.centers = cluster.k_means(Xless, 10)[0] Xless = np.dot(Xless, self.centers.T) self.weights = np.dot(np.dot(np.linalg.inv(np.dot(Xless.T,Xless) + lam*np.identity(Xless.shape[1])), Xless.T),ytrain)
def kmeans(xs, k): assert xs.ndim == 2 try: from sklearn.cluster import k_means _, labels, _ = k_means(xs.astype("float64"), k) except ImportError: from scipy.cluster.vq import kmeans2 _, labels = kmeans2(xs, k, missing='raise') return labels
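# Usage note (added): the helper prefers scikit-learn's k_means and falls back to SciPy's
# kmeans2 only if scikit-learn is missing; either way just the label vector is returned.
# The random points below are made up for illustration.
import numpy as np
pts_demo = np.random.RandomState(0).rand(50, 3)
point_labels = kmeans(pts_demo, 4)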
def initial_means(self,X,n_clusters):
    mean_KMeans_initial, label, inertia = k_means(X, n_clusters, random_state = 1)
    KernelKMeans_model = KernelKMeans(n_clusters=n_clusters, random_state=1, kernel="rbf", gamma=None, coef0=1, verbose=0)
    KernelKMeans_model.fit(X)
    mean_KernelKMeans_initial = self.pre_image(X, KernelKMeans_model.labels_, KernelKMeans_model.gamma, n_clusters, 100)
    return mean_KMeans_initial , mean_KernelKMeans_initial
def learn(self, Xtrain, ytrain): """ Learns using the traindata """ Xless = Xtrain[:,self.features] lam = 100 # set lambda for regularizer coefficient self.centers = cluster.k_means(Xless, 10)[0] Xless = (1 + np.dot(Xless, self.centers.T))**self.d self.weights = np.dot(np.dot(np.linalg.inv(np.dot(Xless.T,Xless) + lam*np.identity(Xless.shape[1])), Xless.T),ytrain)
def gap(data, refs=None, nrefs=20, ks=range(1,11)): """ I: NumPy array, reference matrix, number of reference boxes, number of clusters to test O: Gaps NumPy array, Ks input list Give the list of k-values for which you want to compute the statistic in ks. By Gap Statistic from Tibshirani, Walther. """ shape = data.shape if not refs: tops = data.max(axis=0) bottoms = data.min(axis=0) dists = scipy.matrix(scipy.diag(tops - bottoms)) rands = scipy.random.random_sample(size=(shape[0], shape[1], nrefs)) for i in range(nrefs): rands[:, :, i] = rands[:, :, i] * dists + bottoms else: rands = refs gaps = scipy.zeros((len(ks),)) for (i,k) in enumerate(ks): k_means_args_dict['n_clusters'] = k kmeans = k_means(**k_means_args_dict) kmeans.fit(data) (cluster_centers, point_labels) = kmeans.cluster_centers_, kmeans.labels_ disp = sum([dst(data[current_row_index, :], cluster_centers[point_labels[current_row_index],:]) for current_row_index in range(shape[0])]) refdisps = scipy.zeros((rands.shape[2],)) for j in range(rands.shape[2]): kmeans = k_means(**k_means_args_dict) kmeans.fit(rands[:, : ,j]) (cluster_centers, point_labels) = kmeans.cluster_centers_, kmeans.labels_ refdisps[j] = sum([dst(rands[current_row_index,:,j], cluster_centers[point_labels[current_row_index],:]) for current_row_index in range(shape[0])]) #let k be the index of the array 'gaps' gaps[i] = scipy.mean(scipy.log(refdisps)) - scipy.log(disp) return ks, gaps
def cluster_spatial_data(X, n_parcels, xyz=None, shape=None, mask=None, method='ward', verbose=False): """Cluster the data using Ward's algorithm Parameters ========== X: array of shape(n_voxels, n_subjects) the functional data, across subjects n_parcels: int, the desired number of parcels xyz: array of shape (n_voxels, 3), optional positions of the voxels in grid coordinates shape: tuple: the domain shape (assuming a grid structure), optional alternative specification of positions mask: arbitrary array of arbitrary dimension,optional alternative specification of positions method: string, one of ['ward', 'spectral', 'kmeans'], optional clustering method Returns ======= label: array of shape(n_voxels): the resulting cluster assignment Note ==== One of xyz, shape or mask needs to be provided """ from sklearn.cluster import spectral_clustering, k_means if mask is not None: connectivity = grid_to_graph(*shape, mask=mask) elif shape is not None: connectivity = grid_to_graph(*shape) elif xyz is not None: from sklearn.neighbors import kneighbors_graph n_neighbors = 2 * xyz.shape[1] connectivity = kneighbors_graph(xyz, n_neighbors=n_neighbors) else: raise ValueError('One of mask, shape or xyz has to be provided') if n_parcels == 1: return np.zeros(X.shape[0]) if method == 'ward': connectivity = connectivity.tocsr() ward = Ward(n_clusters=n_parcels, connectivity=connectivity).fit(X) label = ward.labels_ elif method == 'spectral': i, j = connectivity.nonzero() sigma = np.sum((X[i] - X[j]) ** 2, 1).mean() connectivity.data = np.exp(- np.sum((X[i] - X[j]) ** 2, 1) / (2 * sigma)) label = spectral_clustering(connectivity, n_clusters=n_parcels) elif method == 'kmeans': _, label, _ = k_means(X, n_parcels) else: raise ValueError('Unknown method for parcellation') return label
def cluster_index_2(X): global_mean = np.mean(X, axis=0) sum_squared_distances = (((X - global_mean) ** 2).sum(axis=1)).sum() # Sum of squared distances of each sample from the global mean centroids, labels, inertia = k_means(X, 2) ci = inertia / sum_squared_distances return ci, labels
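# Illustrative usage (added): the cluster index is the 2-means inertia normalised by the
# total sum of squares around the global mean, so values near 0 indicate a clear
# two-cluster structure. The two data sets below are synthetic examples.
import numpy as np
rng = np.random.RandomState(0)
well_separated = np.vstack([rng.normal(0.0, 0.3, (100, 2)), rng.normal(5.0, 0.3, (100, 2))])
unstructured = rng.rand(200, 2)
ci_sep, _ = cluster_index_2(well_separated)   # close to 0
ci_flat, _ = cluster_index_2(unstructured)    # noticeably larger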
def KMEANS(data, k): if data.shape[0] < 20000: centroids, cluster_IDs, _ = k_means(data, k, init = 'k-means++', precompute_distances = 'auto', n_init = 20, max_iter = 200) else: mbkm = MiniBatchKMeans(k, 'k-means++', max_iter = 100, batch_size = data.shape[0] / k, n_init = 20) mbkm.fit(data) centroids = mbkm.cluster_centers_ cluster_IDs = mbkm.labels_ return centroids, cluster_IDs
def smythEmissionDistribution(pair): """ Given a pair (S: list of sequences, target_m: int), get the emission distribution for Smyth's "default" HMM. target_m is an upper bound on the number of states -- if we can only have m' distinct observation values, then the distribution for a m' state HMM is returned. @param pair: A tuple of the form (S: list of sequences, target_m: int) @return: (B, labels, has_zero), where: * S', obs = concat(S), set(S) * m' = min(target_m, len(obs)) * [C_0,...,C_{m'-1}] = result of clustering S' with k-means. * labels: tells which cluster each item in merged goes into; i.e., labels[i] = j, where S'[i] belongs to cluster C_j. * B[i] = (mean(C_i), stddev(C_i)). * has_zero = True if there is i such that B[i][1] ~= 0.0. """ S, target_m = pair # merged list of 1d vectors, set of distinct observation values merged, distinct = prepareSeqs(S) # m_prime is min of either target_m or the number of distinct obs values m_prime = min(target_m, len(distinct)) # k-means partitions merged into m_prime clusters [C_0,...,C_{m'-1}]. # centroids = [c_0,...,c_{m'-1}]: cluster centers; i.e., c_i is the center # of C_j. # labels: tells which cluster each item in merged goes into; i.e., # labels[i] = j, where merged[i] belongs to cluster C_j. # inertia: sum of distances of samples to closest cluster center # inertia = sum_{i=0}^{m'-1}(sum_{x in C_i} dist(x, c_i)). centroids, labels, inertia = k_means(merged, m_prime, init='k-means++') # takes labels and arranges merged into # a list of lists, each of which contains the series from one cluster # clusters = [C_0,..,C_{m'-1}] clusters = partition(merged, labels) # Compute (B, labels, has_zero), where # B[i] = (mean(C_i), stddev(C_i)). # has_zero = True if there is i such that B[i][1] ~= 0.0. B = [] has_zero = False for cluster in clusters: assert len(cluster) > 0 mu = mean(cluster) stddev = std(cluster) B.append((mu, stddev)) if stddev < 0.001: has_zero = True return (B, labels, has_zero)
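# Minimal self-contained sketch (added) of the emission-distribution idea documented
# above, using plain numpy in place of the module helpers prepareSeqs/partition, which
# are not reproduced here. The sequences S_demo and the target_m cap of 3 are made up.
import numpy as np
from sklearn.cluster import k_means
S_demo = [[0.10, 0.20, 0.15], [2.00, 2.10, 1.90], [0.12, 2.05]]
merged_demo = np.array([[obs] for seq in S_demo for obs in seq])   # concat(S) as 1-D column vectors
m_prime_demo = min(3, len(np.unique(merged_demo)))                 # cap target_m by the distinct values
centroids_demo, labels_demo, _ = k_means(merged_demo, m_prime_demo, init='k-means++')
B_demo = [(merged_demo[labels_demo == j].mean(), merged_demo[labels_demo == j].std())
          for j in range(m_prime_demo)]                            # (mu, stddev) per cluster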
def test_k_means(self): iris = datasets.load_iris() df = pdml.ModelFrame(iris) result = df.cluster.k_means(3, random_state=self.random_state) expected = cluster.k_means(iris.data, 3, random_state=self.random_state) self.assertEqual(len(result), 3) self.assert_numpy_array_almost_equal(result[0], expected[0]) self.assertTrue(isinstance(result[1], pdml.ModelSeries)) self.assert_index_equal(result[1].index, df.index) self.assert_numpy_array_equal(result[1].values, expected[1]) self.assertAlmostEqual(result[2], expected[2])
def kcluster(content, num_cluster = 6, num_key = 15, single = False, num_top = -1):
    # num_key is the number of keywords that we extract from a cluster
    # we can find the union of the extracted keys from each cluster
    one_hot_tokens, weight_array, mapping_back, postprocess_sentences, C = get_rakeweight_data(content)

    # in case we have fewer vectors than the cluster number
    num_cluster = min(num_cluster, len(weight_array))

    token_weights = defaultdict(float)
    keyword_weights = defaultdict(float)

    k_clusters = cluster.k_means(weight_array, num_cluster)[0]
    union_array = []
    keywords = []
    num_key = min(num_key, len(one_hot_tokens))
    for i,vec in enumerate(k_clusters):
        tmp = sorted(range(len(vec)), key=lambda i: vec[i])[-num_key:]
        union_array = list(set(tmp) | set(union_array))
        for ind in tmp:
            token = one_hot_tokens[ind]
            degree = sum(C[token].values())
            freq = C[token][token]
            # currently degree. update to different weight scheme if needed
            token_weights[token] += float(degree)

    for ind in union_array:
        token = one_hot_tokens[ind]
        keyword = mapping_back[token]
        keywords.append(keyword.encode('ascii'))
        # for all tokens that map to keyword
        keyword_weights[keyword] += token_weights[token]

    keywords = list(set(keywords))
    if single:
        keywords = set(keywords)
        if (num_top < 0):
            num_top = len(keywords)
        return random.sample(keywords, min(len(keywords), num_top))

    # get keyphrases
    keyphrases, keyphrase_freq = get_keyphrases(keywords, postprocess_sentences)

    # keyphrases_weights = sum keyword_weights[word] / total_words
    # for all words in keywords
    keyphrases_weights = get_keyphrase_weights(keyphrases, keyword_weights, keyphrase_freq)
    keyword_weights.update(keyphrases_weights)

    if num_top < 0:
        num_top = len(keyword_weights)/3

    top_keywords = sorted(keyword_weights, key=keyword_weights.get, reverse=True)[:min(num_top, len(keyword_weights))]
    # for keyword in top_keywords:
    #     print(keyword + ' '*(40-len(keyword)) + str(keyword_weights[keyword]))
    return top_keywords
def call_k_means(data, n_clusters):
    """
    k_means(X, n_clusters, init='k-means++', precompute_distances=True,
            n_init=10, max_iter=300, verbose=False, tol=0.0001,
            random_state=None, copy_x=True, n_jobs=1)
    Returns: centroid, label, inertia
    """
    k_clusters, k_labels, k_dis = cluster.k_means(data, n_clusters)
    print "Cluster centers:\n", k_clusters
    print "Cluster labels:\n", k_labels
    print "Distance (inertia):\n", k_dis
    return k_labels
def kcluster(mapping_back, num_cluster, weight_array, one_hot_tokens, num_key = 3):
    # num_key is the number of keywords that we extract from a cluster
    # we can find the union of the extracted keys from each cluster

    # in case we have fewer vectors than the cluster number
    num_cluster = min(num_cluster, len(weight_array))

    k_clusters = cluster.k_means(weight_array, num_cluster)[0]
    union_array = []
    num_key = min(num_key, len(one_hot_tokens))
    for i,vec in enumerate(k_clusters):
        tmp = sorted(range(len(vec)), key=lambda i: vec[i])[-num_key:]
        union_array = list(set(tmp) | set(union_array))
    res = []
    for ind in union_array:
        res.append(mapping_back[one_hot_tokens[ind]])
    return res
def learn(self, Xtrain, ytrain): """ Learns using the traindata """ Xless = Xtrain[:,self.features] lam = 1000 # set regularizer coeff num = Xless.shape[0] XKer = np.zeros((num, 10)) self.centers = cluster.k_means(Xless, 10)[0] #transform Data for i in range(num): for j in range(10): XKer[i, j] = np.exp(-(np.linalg.norm(Xless[i, ] - self.centers[j]))/2*self.s**2) self.weights = np.dot(np.dot(np.linalg.inv(np.dot(XKer.T,XKer) + lam*np.identity(XKer.shape[1])), XKer.T),ytrain)
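# Hedged companion sketch (added): the class above only defines learn(); a matching
# predict() would re-apply the same kernel transform against the stored k-means centers
# before multiplying by the ridge weights. The attribute names (self.features, self.s,
# self.centers, self.weights) mirror the snippet above; everything else is an assumption
# about the surrounding class, not its actual API.
import numpy as np
def predict(self, Xtest):
    """ Predicts targets for Xtest using the centers/weights fitted in learn(). """
    Xless = Xtest[:, self.features]
    num = Xless.shape[0]
    XKer = np.zeros((num, 10))
    # reproduce the kernel expression from learn() verbatim so train/test features match
    for i in range(num):
        for j in range(10):
            XKer[i, j] = np.exp(-(np.linalg.norm(Xless[i, ] - self.centers[j]))/2*self.s**2)
    return np.dot(XKer, self.weights)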